[{"data":1,"prerenderedAt":198},["ShallowReactive",2],{"DlFXI4Eibt_Bn9lrEZz1TYbHCWFZj3IvqwHQSEW-Exc":3,"-jHBOywffmjS1NnW1tTXwZ9bKGvH4tp-X5khqZa5fao":194},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"hot":39,"new":78,"banner":118,"data":143,"cache":193},[8,9,10],"Agent","OpenAI","LLM",[12,14,17,20,23,25,27,30,33,36],{"title":8,"total":13},39,{"title":15,"total":16},"Google",44,{"title":18,"total":19},"Nvidia",13,{"title":21,"total":22},"Claude",11,{"title":9,"total":24},35,{"title":10,"total":26},85,{"title":28,"total":29},"DeepSeek",9,{"title":31,"total":32},"OCR",1,{"title":34,"total":35},"Chat",7,{"title":37,"total":38},"Generator",116,[40,48,55,64,71],{"id":41,"publish_date":42,"is_original":4,"collection":5,"cover_url":43,"cover_url_1_1":44,"title":45,"summary":46,"author":47},557,"2022-04-29","article_res/cover/7a9b1375ed9bb298154981bae42b794d.jpeg","article_res/cover/afa281dd52bc0454e6735daa8e6b0706.jpeg","Translation and summary of Messari Report [2.8 Kristin Smith, Blockchain Association and Katie Haun, a16z]","We need unity and speed right now.","Translation",{"id":49,"publish_date":50,"is_original":4,"collection":5,"cover_url":51,"cover_url_1_1":52,"title":53,"summary":54,"author":47},531,"2022-05-25","article_res/cover/e8362057f8fa189594c60afdfaaeb6e5.jpeg","article_res/cover/8ea08d0d6fa7eee6b57ed4ec61b61ad6.jpeg","Decentralized Society: Finding Web3’s Soul / Decentralized Society: Finding the Soul of Web3 -7","Decentralization through Pluralism When analyzing ecosystems, it's desirable to measure how decentralized it is.",{"id":56,"publish_date":57,"is_original":32,"collection":58,"cover_url":59,"cover_url_1_1":60,"title":61,"summary":62,"author":63},127,"2024-11-14","#Google #AI Game #World Model #AI Story","article_res/cover/0233a875b7ec2debf59779e311547569.jpeg","article_res/cover/6ffddb6ae4914b3c699493311aa9f198.jpeg","Google Launches \"Unbounded\": A Generative Infinite Character Life Simulation Game","Unbounded: A Generative Infinite 
Game of Character Life Simulation","Renee's Entrepreneurial Journey",{"id":13,"publish_date":65,"is_original":32,"collection":66,"cover_url":67,"cover_url_1_1":68,"title":69,"summary":70,"author":63},"2025-02-14","#Deep Dive into LLMs #Andrej Karpathy #LLM #Tool Use #Hallucination","article_res/cover/11e858ad6b74dfa80f923d549b62855c.jpeg","article_res/cover/615e1b320f1fc163edc1d2d154a6de33.jpeg","Andrej Karpathy's in-depth explanation of LLM (Part 4): Hallucinations","hallucinations, tool use, knowledge/working memory",{"id":72,"publish_date":73,"is_original":4,"collection":5,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":47},579,"2022-04-07","article_res/cover/39387376ba28447af1eb40576b9df215.jpeg","article_res/cover/02727ede8551ed49901d0abe6d6305b7.jpeg","Messari Report Translation and Summary 【1-7 Surviving the Winter】","I’d be more cautious here: 10 year and 10 hour thinking only.",[79,87,95,103,111],{"id":80,"publish_date":81,"is_original":32,"collection":82,"cover_url":83,"cover_url_1_1":84,"title":85,"summary":86,"author":63},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},626,"2025-03-21","#Deep Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human 
capabilities.",{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to come",{"id":112,"publish_date":105,"is_original":32,"collection":113,"cover_url":114,"cover_url_1_1":115,"title":116,"summary":117,"author":63},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[119,127,135],{"id":120,"publish_date":121,"is_original":32,"collection":122,"cover_url":123,"cover_url_1_1":124,"title":125,"summary":126,"author":63},160,"2024-10-04","#Philosophy","article_res/cover/496990c49211e8b7f996b7d39c18168e.jpeg","article_res/cover/14dbaa1ade9cb4316d5829423a900362.jpeg","Time","The fungus of the morning does not know the waxing and waning of the moon, and the cicada does not know the seasons; this is a short life. To the south of the state of Chu there is a dark spirit which regards five hundred years as spring and five hundred years as autumn. 
In ancient times there was a great tree called the Ming which regarded eight thousand years as spring and eight thousand years as autumn; this is a long life.",{"id":128,"publish_date":129,"is_original":32,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":63},98,"2024-12-17","#AI Video Generator #Sora #Pika","article_res/cover/3b86e85d03fff4f356a3e4cf2bb329c9.jpeg","article_res/cover/5fa5c20ad0b40f8f544d257c0ef02938.jpeg","Pika 2.0 video generation officially released: effect comparison with Sora","今天，我们推出了Pika 2.0模型。卓越的文字对齐效果。惊人的视觉表现。还有✨场景成分✨",{"id":136,"publish_date":137,"is_original":32,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":63},71,"2025-01-14","#Nvidia #World Foundation Model #Cosmos #Physical AI #Embodied AI","article_res/cover/feddf8c832dfb45d28804291f6a42a9e.jpeg","article_res/cover/d6bc2f1186d96b78228c2283a17a3645.jpeg","NVIDIA's Cosmos World Model","Cosmos World Foundation Model Platform for Physical AI",[144,163,188],{"title":8,"items":145},[146,147,155],{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},{"id":148,"publish_date":149,"is_original":32,"collection":150,"cover_url":151,"cover_url_1_1":152,"title":153,"summary":154,"author":63},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":156,"publish_date":157,"is_original":32,"collection":158,"cover_url":159,"cover_url_1_1":160,"title":161,"summary":162,"author":63},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the 
competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"title":9,"items":164},[165,172,180],{"id":166,"publish_date":157,"is_original":32,"collection":167,"cover_url":168,"cover_url_1_1":169,"title":170,"summary":171,"author":63},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":173,"publish_date":174,"is_original":4,"collection":175,"cover_url":176,"cover_url_1_1":177,"title":178,"summary":179,"author":63},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  
\n - OpenAI",{"id":181,"publish_date":182,"is_original":4,"collection":183,"cover_url":184,"cover_url_1_1":185,"title":186,"summary":187,"author":63},417,"2023-08-24","#OpenAI","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"title":10,"items":189},[190,191,192],{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},true,{"code":4,"msg":5,"data":195},{"id":96,"publish_date":97,"is_original":32,"collection":98,"articles_id":196,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63,"content":197},"PO2a6bU4504zuzmYFJN0FQ","\u003Cdiv class=\"rich_media_content js_underline_content\n                       autoTypeSetting24psection\n            \" id=\"js_content\">\u003Csection style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">In reinforcement learning, there is an important branch called \"Reinforcement Learning from Human Feedback\" 
(RLHF), which is particularly adept at handling problem domains that are difficult to directly verify (unverifiable domains).\u003C/span>\u003C/section>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Most of what we have dealt with so far are \"verifiable domains,\" meaning that any answer within these domains can be easily compared to a standard answer. For example, if the standard answer is the number \"3,\" we can simply check whether the model's answer is \"3.\"\u003C/span>\u003C/p>\u003Csection style=\"text-align: center;\" nodeleaf=\"\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100010453\" data-ratio=\"0.7777777777777778\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" type=\"block\" style=\"height: auto !important;\" src=\"https://res.cooltool.vip/article_res/assets/17433488127310.7923530940618493.png\">\u003C/section>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: 
initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">We can even use a mechanism called an \"LLM judge\" to evaluate the model's answers by scoring them, which has proven sufficiently reliable in practice to automatically complete the grading process using the capabilities of large language models (LLMs).\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">However, the real challenge for RLHF lies in those areas where answers cannot be easily verified, known as \"unverifiable domains.\" For instance, creative writing tasks such as writing a joke about pelicans, composing a poem, or summarizing a passage. 
These tasks cannot simply be compared to a specific answer.\u003C/span>\u003C/p>\u003Csection style=\"text-align: center;\" nodeleaf=\"\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-ratio=\"0.845360824742268\" data-s=\"300,640\" data-type=\"png\" data-w=\"776\" type=\"block\" data-imgfileid=\"100010459\" style=\"height: auto !important;\" src=\"https://res.cooltool.vip/article_res/assets/17433488125110.7583761342543061.png\">\u003C/section>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">For example, we ask the model to create a joke about pelicans:\u003C/span>\u003C/p>\u003Csection style=\"text-align: center;\" nodeleaf=\"\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-ratio=\"0.30718336483931946\" data-s=\"300,640\" data-type=\"png\" data-w=\"1058\" type=\"block\" data-imgfileid=\"100010454\" style=\"height: auto !important;\" src=\"https://res.cooltool.vip/article_res/assets/17433488125270.2662308884299667.png\">\u003C/section>\u003Cul style='box-sizing: border-box;margin: 8px 0px;;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 
0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Why don't pelicans ever pay their own bills? Because they always \"beak\" (peek) and pass it on to others. Clearly, this joke isn't very successful.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Although the model can generate a large number of jokes, evaluating their quality is a challenge. 
Theoretically, we could have humans check and score each one individually, but the reinforcement learning process often involves tens of thousands of iterations, each producing hundreds or even thousands of samples, making it impossible for humans to check each one.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Therefore, Reinforcement Learning from Human Feedback (RLHF) was developed. By guiding the model with limited human feedback, it enables the model to learn how to produce high-quality outputs in similar unverifiable domains. 
This method greatly reduces the workload of human involvement while improving the model's performance in complex tasks such as creative writing and dialogue generation.\u003C/span>\u003C/p>\u003Ch2 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">How does RLHF work specifically?\u003C/span>\u003C/span>\u003C/h2>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">If we had unlimited human time, theoretically, we could continuously improve the model through direct human feedback. For example, we could do 1000 updates, evaluating 1000 prompts per update, and each prompt generating 1000 answers, requiring humans to assess 1 billion jokes in total. 
This is clearly impractical.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">To solve this problem, OpenAI (some of whose members later founded Anthropic) proposed a method of RLHF.\u003C/span>\u003C/p>\u003Csection style=\"text-align: center;\" nodeleaf=\"\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-ratio=\"0.3436254980079681\" data-s=\"300,640\" data-type=\"png\" data-w=\"1004\" type=\"block\" data-imgfileid=\"100010458\" style=\"height: auto !important;\" src=\"https://res.cooltool.vip/article_res/assets/17433488126310.10750377830226276.png\">\u003C/section>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The core trick of this method lies in \"indirect guidance\": involving humans only to a limited extent, specifically by training an additional neural network called a \"reward model\" to simulate human 
scoring.\u003C/span>\u003C/p>\u003Csection style=\"text-align: center;\" nodeleaf=\"\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-ratio=\"0.5589307411907655\" data-s=\"300,640\" data-type=\"jpeg\" data-w=\"823\" type=\"block\" data-imgfileid=\"100010456\" style=\"height: auto !important;\" src=\"https://res.cooltool.vip/article_res/assets/17433488125960.3188854267964385.jpeg\">\u003C/section>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">First, humans rank a small amount of generated content (from best to worst) instead of directly scoring it, since ranking is relatively easier. 
This ranking becomes the data for training the reward model.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Then, the reward model takes prompts (such as \"write a joke about pelicans\") and generated content as input, outputting a score between 0 and 1, representing an evaluation from worst to best.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">For example, if someone ranks five jokes, the reward model also scores these five jokes, and we use mathematical methods (defining a loss function) to align the reward model's scores with the human rankings. 
When the model's scores do not align with human rankings, we adjust the reward model through supervision to reduce this gap, gradually making the scores closer to human evaluations.\u003C/span>\u003C/p>\u003Csection style=\"text-align: center;\" nodeleaf=\"\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-ratio=\"0.8414634146341463\" data-s=\"300,640\" data-type=\"png\" data-w=\"820\" type=\"block\" data-imgfileid=\"100010457\" style=\"height: auto !important;\" src=\"https://res.cooltool.vip/article_res/assets/17433488124780.03707609327349548.png\">\u003C/section>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Through this method, we can efficiently perform a large number of automatic evaluations, significantly expanding the application scope of reinforcement learning. 
Although this simulator is not perfect, as long as its scores are statistically close enough to human judgments, the practical application effects will be significantly improved.\u003C/span>\u003C/p>\u003Ch2 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">What are the advantages of RLHF?\u003C/span>\u003C/span>\u003C/h2>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The main advantage of Reinforcement Learning from Human Feedback is:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: 
normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">It allows the application of reinforcement learning in any domain (including unverifiable domains), such as writing poetry, jokes, or summarizing content.\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">This method has been proven to significantly enhance model performance, possibly related to the \"gap between distinguishing and generating difficulty.\"\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">By having humans perform simpler tasks (such as ranking rather than direct creation), we obtain more accurate and reliable feedback data, thereby enhancing model performance.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Ch2 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: 
normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">What are the limitations of RLHF?\u003C/span>\u003C/span>\u003C/h2>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Despite the obvious advantages of RLHF, there are also some significant limitations:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli 
style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Simulation error\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: The reward model is merely an approximate simulation of human feedback and may not perfectly reflect real human judgment. It might be misleading.\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Adversarial sample problem\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Since the reward model is a complex neural network, reinforcement learning may discover special inputs (adversarial samples) that receive high scores but are actually meaningless. 
This situation is called \"gaming the model.\"\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;;\">\u003Csection style=\"box-sizing: border-box;;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Limited optimization\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Unlike verifiable domains (such as Go), RLHF cannot be optimized indefinitely; otherwise, the reward model is easily gamed and the model produces absurd outputs. Initially, performance improves, but then it dramatically falls off a cliff, resulting in very absurd outcomes, like giving \"the the the the the the the the\" when asked to write a joke. 
Typically, optimization must be stopped after a certain number of iterations to prevent the reward model from being completely misled.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Therefore, although RLHF can effectively improve model performance, it is more suitable for limited fine-tuning tasks rather than infinite optimization reinforcement learning tasks.\u003C/span>\u003C/p>\u003Ch2 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">The relationship between RLHF and RL\u003C/span>\u003C/span>\u003C/h2>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 
0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Although RLHF is closely related to traditional reinforcement learning (RL), there are essential differences. Simply put, although RLHF belongs to reinforcement learning, it is not traditional RL because it cannot optimize without limits.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">In traditional reinforcement learning scenarios, such as Go, we can clearly determine wins and losses and we have a perfect simulator, allowing reinforcement learning to continuously optimize and eventually surpass human performance. However, in RLHF, we use a reward model as a simulator of human feedback. 
Fundamentally, the reward model is just a complex neural network, and it mimics human scoring imperfectly, making it susceptible to deception (i.e., to adversarial samples).\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Therefore, although RLHF can effectively improve model performance through indirect human feedback, it is more akin to limited \"fine-tuning\" than to true infinite-optimization reinforcement learning. This approach can slightly improve model performance, but there is no \"magic\" of unlimited enhancement through increased computational resources and continuous optimization.\u003C/span>\u003C/p>\u003Cp style=\"display: none;\">\u003Cmp-style-type data-value=\"3\">\u003C/mp-style-type>\u003C/p>\u003C/div>",1752585421978]