<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Projects on 卓琪的开发笔记</title>
    <link>https://zhuoqidev.com/en/projects/</link>
    <description>Recent content in Projects on 卓琪的开发笔记</description>
    <generator>Hugo -- gohugo.io</generator>
    <language>en</language>
    <copyright>© 2026 Liu ZhuoQi</copyright>
    <lastBuildDate>Mon, 04 May 2026 00:00:00 +0000</lastBuildDate><atom:link href="https://zhuoqidev.com/en/projects/index.xml" rel="self" type="application/rss+xml" />
    
    <item>
      <title>Why LLMs Have No Memory — A Cross-Validated Research Report with 67 Primary Sources</title>
      <link>https://zhuoqidev.com/en/projects/llm-memory-research/</link>
      <pubDate>Mon, 04 May 2026 00:00:00 +0000</pubDate>
      
      <guid>https://zhuoqidev.com/en/projects/llm-memory-research/</guid>
      <description>&lt;h2 class=&#34;relative group&#34;&gt;1. Why LLMs Are Stateless&#xA;    &lt;div id=&#34;1-why-llms-are-stateless&#34; class=&#34;anchor&#34;&gt;&lt;/div&gt;&#xA;    &#xA;    &lt;span&#xA;        class=&#34;absolute top-0 w-6 transition-opacity opacity-0 -start-6 not-prose group-hover:opacity-100 select-none&#34;&gt;&#xA;        &lt;a class=&#34;text-primary-300 dark:text-neutral-700 !no-underline&#34; href=&#34;#1-why-llms-are-stateless&#34; aria-label=&#34;Anchor&#34;&gt;#&lt;/a&gt;&#xA;    &lt;/span&gt;&#xA;    &#xA;&lt;/h2&gt;&#xA;&lt;p&gt;Four independent constraints — individually manageable, together they leave &amp;ldquo;stateless&amp;rdquo; as the only viable engineering solution. This conclusion is cross-validated across 67 primary sources.&lt;/p&gt;&#xA;&#xA;&lt;h3 class=&#34;relative group&#34;&gt;Architecture: O(n²) Attention&#xA;    &lt;div id=&#34;architecture-on-attention&#34; class=&#34;anchor&#34;&gt;&lt;/div&gt;&#xA;    &#xA;    &lt;span&#xA;        class=&#34;absolute top-0 w-6 transition-opacity opacity-0 -start-6 not-prose group-hover:opacity-100 select-none&#34;&gt;&#xA;        &lt;a class=&#34;text-primary-300 dark:text-neutral-700 !no-underline&#34; href=&#34;#architecture-on-attention&#34; aria-label=&#34;Anchor&#34;&gt;#&lt;/a&gt;&#xA;    &lt;/span&gt;&#xA;    &#xA;&lt;/h3&gt;&#xA;&lt;p&gt;Self-attention scales at &lt;code&gt;O(n²)&lt;/code&gt;. A single 4096-token sequence needs &lt;del&gt;2 GB VRAM for KV cache; 32 concurrent sessions hit 64 GB — more than the model weights themselves. Llama 3.1 at 100M context requires 638 H100 GPUs (&lt;/del&gt;$5,400/hour) for KV cache alone.&lt;/p&gt;</description>
      
    </item>
    
  </channel>
</rss>
