[{"data":1,"prerenderedAt":205},["ShallowReactive",2],{"DlFXI4Eibt_Bn9lrEZz1TYbHCWFZj3IvqwHQSEW-Exc":3,"VItbWfyEbhUH6ctoR6fgKq2CaF8ii66oWBtlDDQUQJY":194},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"hot":39,"new":78,"banner":118,"data":143,"cache":193},[8,9,10],"Agent","OpenAI","LLM",[12,14,17,20,23,25,27,30,33,36],{"title":8,"total":13},39,{"title":15,"total":16},"Google",44,{"title":18,"total":19},"Nvidia",13,{"title":21,"total":22},"Claude",11,{"title":9,"total":24},35,{"title":10,"total":26},85,{"title":28,"total":29},"DeepSeek",9,{"title":31,"total":32},"OCR",1,{"title":34,"total":35},"Chat",7,{"title":37,"total":38},"Generator",116,[40,48,55,64,71],{"id":41,"publish_date":42,"is_original":4,"collection":5,"cover_url":43,"cover_url_1_1":44,"title":45,"summary":46,"author":47},557,"2022-04-29","article_res/cover/7a9b1375ed9bb298154981bae42b794d.jpeg","article_res/cover/afa281dd52bc0454e6735daa8e6b0706.jpeg","Translation and summary of Messari Report [2.8 Kristin Smith, Blockchain Association and Katie Haun, a16z]","We need unity and speed right now.","Translation",{"id":49,"publish_date":50,"is_original":4,"collection":5,"cover_url":51,"cover_url_1_1":52,"title":53,"summary":54,"author":47},531,"2022-05-25","article_res/cover/e8362057f8fa189594c60afdfaaeb6e5.jpeg","article_res/cover/8ea08d0d6fa7eee6b57ed4ec61b61ad6.jpeg","Decentralized Society: Finding Web3’s Soul / Decentralized Society: Finding the Soul of Web3 -7","Decentralization through Pluralism When analyzing ecosystems, it's desirable to measure how decentralized it is.",{"id":56,"publish_date":57,"is_original":32,"collection":58,"cover_url":59,"cover_url_1_1":60,"title":61,"summary":62,"author":63},127,"2024-11-14","#Google #AI Game #World Model #AI Story","article_res/cover/0233a875b7ec2debf59779e311547569.jpeg","article_res/cover/6ffddb6ae4914b3c699493311aa9f198.jpeg","Google Launches \"Unbounded\": A Generative Infinite Character Life Simulation Game","Unbounded: A Generative Infinite Game of Character Life Simulation","Renee's Entrepreneurial Journey",{"id":13,"publish_date":65,"is_original":32,"collection":66,"cover_url":67,"cover_url_1_1":68,"title":69,"summary":70,"author":63},"2025-02-14","#Deep Dive into LLMs #Andrej Karpathy #LLM #Tool Use #Hallucination","article_res/cover/11e858ad6b74dfa80f923d549b62855c.jpeg","article_res/cover/615e1b320f1fc163edc1d2d154a6de33.jpeg","Andrej Karpathy's in-depth explanation of LLM (Part 4): Hallucinations","hallucinations, tool use, knowledge/working memory",{"id":72,"publish_date":73,"is_original":4,"collection":5,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":47},579,"2022-04-07","article_res/cover/39387376ba28447af1eb40576b9df215.jpeg","article_res/cover/02727ede8551ed49901d0abe6d6305b7.jpeg","Messari Report Translation and Summary 【1-7 Surviving the Winter】","I’d be more cautious here: 10 year and 10 hour thinking only.",[79,87,95,103,111],{"id":80,"publish_date":81,"is_original":32,"collection":82,"cover_url":83,"cover_url_1_1":84,"title":85,"summary":86,"author":63},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},626,"2025-03-21","#Deep Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human capabilities.",{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to come",{"id":112,"publish_date":105,"is_original":32,"collection":113,"cover_url":114,"cover_url_1_1":115,"title":116,"summary":117,"author":63},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[119,127,135],{"id":120,"publish_date":121,"is_original":32,"collection":122,"cover_url":123,"cover_url_1_1":124,"title":125,"summary":126,"author":63},160,"2024-10-04","#Philosophy","article_res/cover/496990c49211e8b7f996b7d39c18168e.jpeg","article_res/cover/14dbaa1ade9cb4316d5829423a900362.jpeg","Time","The fungus of the morning does not know the waxing and waning of the moon, and the cicada does not know the seasons; this is a short life. To the south of the state of Chu there is a dark spirit which regards five hundred years as spring and five hundred years as autumn. In ancient times there was a great tree called the Ming which regarded eight thousand years as spring and eight thousand years as autumn; this is a long life.",{"id":128,"publish_date":129,"is_original":32,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":63},98,"2024-12-17","#AI Video Generator #Sora #Pika","article_res/cover/3b86e85d03fff4f356a3e4cf2bb329c9.jpeg","article_res/cover/5fa5c20ad0b40f8f544d257c0ef02938.jpeg","Pika 2.0 video generation officially released: effect comparison with Sora","今天，我们推出了Pika 2.0模型。卓越的文字对齐效果。惊人的视觉表现。还有✨场景成分✨",{"id":136,"publish_date":137,"is_original":32,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":63},71,"2025-01-14","#Nvidia #World Foundation Model #Cosmos #Physical AI #Embodied AI","article_res/cover/feddf8c832dfb45d28804291f6a42a9e.jpeg","article_res/cover/d6bc2f1186d96b78228c2283a17a3645.jpeg","NVIDIA's Cosmos World Model","Cosmos World Foundation Model Platform for Physical AI",[144,163,188],{"title":8,"items":145},[146,147,155],{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},{"id":148,"publish_date":149,"is_original":32,"collection":150,"cover_url":151,"cover_url_1_1":152,"title":153,"summary":154,"author":63},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":156,"publish_date":157,"is_original":32,"collection":158,"cover_url":159,"cover_url_1_1":160,"title":161,"summary":162,"author":63},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"title":9,"items":164},[165,172,180],{"id":166,"publish_date":157,"is_original":32,"collection":167,"cover_url":168,"cover_url_1_1":169,"title":170,"summary":171,"author":63},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":173,"publish_date":174,"is_original":4,"collection":175,"cover_url":176,"cover_url_1_1":177,"title":178,"summary":179,"author":63},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  \n - OpenAI",{"id":181,"publish_date":182,"is_original":4,"collection":183,"cover_url":184,"cover_url_1_1":185,"title":186,"summary":187,"author":63},417,"2023-08-24","#OpenAI","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"title":10,"items":189},[190,191,192],{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},true,{"code":4,"msg":5,"data":195},{"id":196,"publish_date":197,"is_original":32,"collection":198,"articles_id":199,"cover_url":200,"cover_url_1_1":201,"title":202,"summary":203,"author":63,"content":204},313,"2024-03-17","#Devin","HlNvSX3nvtlT80i_uLhl_A","article_res/cover/69b67e82bee935655fe1fead4d60c448.jpeg","article_res/cover/54c109f5149a5ae4199d8b8ff7c6b6ca.jpeg","Devin, Claimed to Replace Junior Engineers","Meet Devin: The World's First AI Software Engineer.","\u003Cdiv class=\"rich_media_content js_underline_content\n                       autoTypeSetting24psection\n            \" id=\"js_content\">\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Four days ago, Cognition Lab released Devin, claiming it to be the first AI software engineer.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003887\" data-ratio=\"0.5342592592592592\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810927900.3280822010333022.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin is the latest state-of-the-art in SWE-Bench coding benchmarks, having successfully passed real engineering interviews at top AI companies and even completed real jobs on Upwork. Devin is an autonomous agent that solves engineering tasks by using its own shell, code editor, and web browser.\u003C/p>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"https://res.cooltool.vip/article_res/assets/17423810925380.35349483433298423.mp4\" poster=\"https://res.cooltool.vip/article_res/assets/17423810911200.9391020000040085.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Ch1 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 24px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Devin's capabilities\u003C/h1>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>With our progress in long-term reasoning and planning, Devin can plan and execute complex engineering tasks requiring thousands of decisions. Devin can recall relevant context at every step, learn over time, and correct mistakes.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>We also equipped Devin with common development tools, including a shell, code editor, and browser—all within a sandboxed computational environment—everything a human needs to get work done.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Finally, we gave Devin the ability to collaborate actively with users. Devin can report progress in real-time, accept feedback, and co-design choices with you when necessary.\u003C/p>\u003Ch1 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 24px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Some examples of what it can do\u003C/h1>\u003Cul data-tool=\"mdnice编辑器\" class=\"list-paddingleft-1\" style='margin-top: 8px;margin-bottom: 8px;padding-left: 25px;width: 537.461px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;'>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Devin can learn how to use unfamiliar technologies. After reading a blog post, Devin ran ControlNet on Modal to generate images for Sara containing hidden messages.\u003C/section>\u003C/li>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Devin can build and deploy applications end-to-end. Devin created an interactive website simulating Conway's Game of Life! It incrementally added features requested by users and then deployed the application to Netlify.\u003C/section>\u003C/li>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Devin can independently find and fix bugs in code repositories. Devin helped Andrew maintain and debug his open-source competitive programming book.\u003C/section>\u003C/li>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Devin can train and fine-tune its own AI models. Given only a link to a research repository on GitHub, Devin set up fine-tuning for a large language model.\u003C/section>\u003C/li>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Devin can handle bugs and feature requests in open-source repositories. With just a link to a GitHub issue, Devin completed all necessary setup and context collection.\u003C/section>\u003C/li>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Devin can contribute to mature production repositories. This example is part of the SWE-bench benchmark. Devin solved an error in logarithmic calculations in the sympy Python algebra system. Devin set up the code environment, reproduced the bug, and independently wrote and tested a fix.\u003C/section>\u003C/li>\u003Cli>\u003Csection style=\"margin-top: 5px;margin-bottom: 5px;line-height: 26px;color: rgb(1, 1, 1);\">Letting Devin take on real-world jobs on Upwork, it performs well! Here, Devin wrote and debugged code to run computer vision models.\u003C/section>\u003C/li>\u003C/ul>\u003Ch1 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 24px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Devin's performance\u003C/h1>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>We evaluated Devin on SWE-bench, a challenging benchmark that requires agents to solve real-world GitHub issues found in open-source projects like Django and scikit-learn.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin correctly solved 13.86% of the problems end-to-end, far surpassing the previous best of 1.96%. Even when given the exact files to edit, the best previous model could only solve 4.80% of the problems.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin was evaluated on a random 25% subset of the dataset. Devin operated without assistance, while all other models were assisted (meaning they were explicitly told which files needed editing).\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img\" data-galleryid=\"\" data-imgfileid=\"100003886\" data-ratio=\"0.5361111111111111\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810930730.7250989274949491.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>\u003Cspan style=\"font-size: 24px;font-weight: bold;\">SWE-bench technical report\u003C/span>\u003Cbr>\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;text-wrap: wrap;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;line-height: 26px;'>SWE-bench was used, an automated benchmark test for software engineering systems composed of GitHub issues and pull requests. SWE-bench deterministically evaluates a system's ability to solve real-world problems in codebases through unit tests, unlike benchmarks such as HumanEval, which are limited to standalone functions.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>SWE-bench is a dataset of 2,294 issues and pull requests scraped from popular open-source Python repositories on GitHub. Its goal is to test a system's ability to write real-world code. Each SWE-bench instance consists of a GitHub issue and the pull request that resolves it. The pull request must include a unit test that fails before the code changes and passes afterward (referred to as a \"fail-to-pass\" test). The diff is split into two parts: patch and test_patch, containing the code changes and test changes, respectively. The system is then asked to generate a diff based on the GitHub issue description and the repository's state at the time of the issue. If all unit tests pass after applying the modifications, the example is considered successful.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>In SWE-bench, LLMs are either given a set of correct files to edit (\"assisted\") or a separate system retrieves the files to edit based on the similarity of the issue text (\"unassisted\"). As an agent, Devin did not receive any file lists but navigated the files itself, making it more similar to an \"unassisted\" LLM. Solving SWE-bench examples correctly is challenging. More difficult PRs require changing dozens of files, maintaining backward compatibility, and/or performing extensive complex reasoning. Even with assistance, the best LLM achieved only a 4.80% success rate.\u003C/p>\u003Ch1 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 24px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Analysis\u003C/h1>\u003Ch2 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 22px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Multi-step planning\u003C/h2>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin can perform multi-step planning to obtain feedback from the environment. 72% of passing tests took over 10 minutes to complete, and the iterative capability helped Devin succeed.\u003C/p>\u003Ch2 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 22px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Qualitative examples\u003C/h2>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Example 1: ✅ scikit-learn__scikit-learn-10870\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003872\" data-ratio=\"0.8388888888888889\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810936460.044841928981623314.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin was initially misled by the description and literally added self.lower_bound_ = max_lower_bound, then returned self. This was actually incorrect because the variable had not been defined.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003873\" data-ratio=\"0.475\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810941160.21610990948972275.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>As provided by the test code in the problem description, Devin then updated the test file:\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003874\" data-ratio=\"0.3472222222222222\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810947280.2257918312598599.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>But after running the tests and receiving an error, Devin corrected the file:\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003875\" data-ratio=\"0.09259259259259259\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810928960.9506531238955205.png\">\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003876\" data-ratio=\"0.6861111111111111\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810951620.974760753701517.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>After completing this correction, Devin reran the tests to make them pass and exited successfully.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>This example has several interesting points:\u003C/p>\u003Cul class=\"list-paddingleft-1\" style=\"list-style-type: disc;\">\u003Cli>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin strictly followed the original instructions in the question, even though they were inaccurate. This indicates over-alignment with user preferences.\u003C/p>\u003C/li>\u003Cli>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin was able to run tests in its environment and correct errors. The ability to iterate is crucial for software developers, and agents should be able to do the same.\u003C/p>\u003C/li>\u003C/ul>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Example 2: ✅ django__django-10973\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003877\" data-ratio=\"0.6722222222222223\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810957020.6182003602266142.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin identified the correct file django/db/backends/postgresql/client.py and completed the full edit:\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003878\" data-ratio=\"0.6907407407407408\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810988320.8610597530602175.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Here, Devin successfully modified a large amount of code. Many successful edits in SWE-bench consist of single-line diffs, but Devin was able to handle multiple lines simultaneously.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Example 3: ❌ sympy__sympy-17313\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>This was a difficult task involving modifying a computer algebra system to correctly handle comparison operators on floor and ceiling objects that can be specified as positive or negative. This required complex logical reasoning and multiple derivation steps.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003879\" data-ratio=\"0.325\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810930610.9830692871553086.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin failed to edit the correct class, mistakenly editing the frac class instead of the floor and ceiling classes. Moreover, Devin only edited one comparison operator __gt__, when __lt__, __le__, and __ge__ needed modification. This edit was far from correct.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003880\" data-ratio=\"0.4962962962962963\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810946810.3862154583003399.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>The correct diff can be found here: https://github.com/sympy/sympy/pull/17313/files. The diff is quite complex, involving many edge case handling and extensive unit tests, requiring deep understanding of the sympy codebase. (Note: To pass an SWE-bench instance, every test must pass.)\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Example 4: ❌ scikit-learn__scikit-learn-10774\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>This task involved adding an extra return option functionality to all datasets in the repository. Devin successfully made such edits for several datasets; an example is shown below.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003881\" data-ratio=\"0.4703703703703704\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810946350.44692924654347466.png\">\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003885\" data-ratio=\"0.13240740740740742\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810939100.4253656307684446.png\">\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Devin successfully made similar edits for datasets california_housing.py, covtype.py, kddcup99.py, and mldata.py (these were actually excluded in the original PR). Unfortunately, Devin missed two datasets lfw.py and rcv1.py, so the final tests failed. The Cognition team plans to improve Devin's ability to edit multiple files.\u003C/p>\u003Ch2 data-tool=\"mdnice编辑器\" style='margin-top: 30px;margin-bottom: 15px;font-weight: bold;font-size: 22px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;letter-spacing: normal;text-align: left;text-wrap: wrap;'>Test-driven experiments\u003C/h2>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Cognition conducted an additional experiment where the final unit tests and problem statements were provided to Devin. In this \"test-driven development\" setting, the success rate increased to 23% (out of 100 sampled tests). (Note: Any changes to the tests themselves were erased before evaluation.)\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>Although the agent had access to the ground-truth test patch, this result is not comparable to other SWE-bench results. However, test-driven development is a common pattern in software engineering, so this setting is a natural extension of SWE-bench. A natural way for human engineers and agents to collaborate is to give the agent a target test to pass, expecting more test-driven agents in the future.\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>New examples solving test problems\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>✅ django__django-13321: Devin solved this by adding a print statement before the function, then running the unit test, and editing the file based on the printed results. The existence of test cases made it easier for Devin to debug.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003884\" data-ratio=\"0.21574074074074073\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810944950.023327632950617128.png\">\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100003882\" data-ratio=\"0.19166666666666668\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423810947370.1056789458372227.png\">\u003Cspan style='font-family: mp-quote, -apple-system-font, BlinkMacSystemFont, \"Helvetica Neue\", \"PingFang SC\", \"Hiragino Sans GB\", \"Microsoft YaHei UI\", \"Microsoft YaHei\", Arial, sans-serif;font-size: var(--articleFontsize);letter-spacing: 0.034em;'>\u003C/span>\u003C/p>\u003Cp data-tool=\"mdnice编辑器\" style='margin-bottom: 0px;padding-top: 8px;padding-bottom: 8px;color: black;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;font-size: 16px;letter-spacing: normal;text-align: left;text-wrap: wrap;line-height: 26px;'>✅ django__django-16983: The new unit test assertion emitted an exact error message: \"The value of 'filter_horizontal[0]' cannot include [...]\". Without the test patch, it would have been impossible to pass the test, highlighting an issue with the benchmark and indicating that perfect scores are impossible without test patches.\u003C/p>\u003Cp style=\"display: none;\">\u003Cmp-style-type data-value=\"3\">\u003C/mp-style-type>\u003C/p>\u003C/div>",1752585474281]