[KVConnector] Remove v0-related kv connector components such as kv pipe and kv lookup buffer (#29705)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Kuntai Du
2025-12-05 02:20:48 +08:00
committed by GitHub
parent 652ba93da3
commit ece2825a29
13 changed files with 0 additions and 1624 deletions

View File

@@ -1,160 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import torch
from tqdm import tqdm
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
# TODO: the test depends on a lot of fields in the current implementation.
# We should have a standard interface instead of direct field access.
def test_run(my_rank, buffer, device):
# buffer should be empty in the beginning
if my_rank == 0:
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0
print(f"My rank: {my_rank}, device: {device}")
# insert
tokens = torch.tensor([1, 2, 3]).to(device)
roi = tokens > 0
if my_rank == 0:
key = 2.0 * torch.ones([5, 6]).to(device)
value = 3.0 * torch.ones([5, 6]).to(device)
placeholder = torch.tensor([1]).to(device)
buffer.insert(tokens, roi, key, value, placeholder)
torch.distributed.barrier()
# drop_select
if my_rank == 1:
tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi)
assert torch.allclose(tokens, tok)
assert torch.allclose(roi, roi_)
assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device))
assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device))
torch.distributed.barrier()
if my_rank == 0:
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0
print(f"My rank: {my_rank}, Test run passed!")
def stress_test(my_rank, buf, device):
torch.distributed.barrier()
torch.manual_seed(100)
reqs = [
(
torch.rand(100).to(device), # tokens
torch.ones(100).bool().to(device), # roi
torch.rand(100).to(device), # key
torch.rand(100).to(device), # value
torch.rand(100).to(device), # hidden
)
for i in tqdm(range(200))
]
random.seed(my_rank)
random.shuffle(reqs)
torch.distributed.barrier()
n = 0
# the buffer can only hold 100 reqs at a time,
# so the sender will occasionally block to wait for the receiver.
for req in tqdm(reqs):
if my_rank == 0:
buf.insert(*req)
else:
tok, roi, k, v, h = req
tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi)
if tok_ is None:
assert roi_ is None
assert k_ is None
assert v_ is None
assert h_ is None
n += 1
else:
assert torch.allclose(tok, tok_)
assert torch.allclose(roi, roi_)
assert torch.allclose(k, k_)
assert torch.allclose(v, v_)
assert torch.allclose(h, h_)
print(f"Rank {my_rank} done")
torch.distributed.barrier()
if my_rank == 0:
x = torch.tensor([0])
torch.distributed.recv(x, 1)
# the number of Nones received equals the number of entries left unselected in the buffer
assert x.item() == len(buf.buffer)
# and each remaining entry occupies 1700 bytes
# (four float32 tensors of 100 elements plus one 100-element bool mask)
print(buf.buffer_size)
assert buf.buffer_size == 1700 * len(buf.buffer)
else:
torch.distributed.send(torch.tensor([n]), 0)
print(f"My rank: {my_rank}, Passed stress test!")
if __name__ == "__main__":
my_rank = int(os.environ["RANK"])
torch.distributed.init_process_group(
backend="gloo",
init_method="tcp://localhost:12398",
world_size=2,
rank=my_rank,
)
print(f"initialized! My rank is {my_rank}")
config = KVTransferConfig(
kv_connector="P2pNcclConnector",
kv_buffer_device="cuda",
kv_buffer_size=1e9,
kv_rank=my_rank,
kv_role="kv_both", # this arg doesn't matter in this test
kv_parallel_size=2,
kv_ip="127.0.0.1",
kv_port=12345,
)
data_pipe = PyNcclPipe(
local_rank=my_rank,
config=config,
device="cuda",
port_offset=0,
)
cpu_pipe = PyNcclPipe(
local_rank=my_rank,
config=config,
device="cpu",
port_offset=1,
)
buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000)
test_run(my_rank, buffer, data_pipe.device)
stress_test(my_rank, buffer, data_pipe.device)
buffer.close()
data_pipe.close()
cpu_pipe.close()
print("Done")

View File

@@ -1,8 +0,0 @@
#!/bin/bash
RANK=0 python3 test_lookup_buffer.py &
PID0=$!
RANK=1 python3 test_lookup_buffer.py &
PID1=$!
wait $PID0
wait $PID1

View File

@@ -1,62 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import sys
import pytest
import torch
def run_python_script(script_name, timeout):
script_name = f"kv_transfer/{script_name}"
try:
# Start both processes asynchronously using Popen
process0 = subprocess.Popen(
[sys.executable, script_name],
env={"RANK": "0"}, # Set the RANK environment variable for process 0
stdout=sys.stdout, # Pipe stdout to current stdout
stderr=sys.stderr, # Pipe stderr to current stderr
)
process1 = subprocess.Popen(
[sys.executable, script_name],
env={"RANK": "1"}, # Set the RANK environment variable for process 1
stdout=sys.stdout, # Pipe stdout to current stdout
stderr=sys.stderr, # Pipe stderr to current stderr
)
# Wait for both processes to complete, with a timeout
process0.wait(timeout=timeout)
process1.wait(timeout=timeout)
# Check the return status of both processes
if process0.returncode != 0:
pytest.fail(f"Test {script_name} failed for RANK=0, {process0.returncode}")
if process1.returncode != 0:
pytest.fail(f"Test {script_name} failed for RANK=1, {process1.returncode}")
except subprocess.TimeoutExpired:
# If either process times out, terminate both and fail the test
process0.terminate()
process1.terminate()
pytest.fail(f"Test {script_name} timed out")
except Exception as e:
pytest.fail(f"Test {script_name} failed with error: {str(e)}")
# Define the test cases using pytest's parametrize
@pytest.mark.parametrize(
"script_name,timeout",
[
("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout
("test_send_recv.py", 120), # First test case with a 120-second timeout
],
)
def test_run_python_script(script_name, timeout):
# Check the number of GPUs
if torch.cuda.device_count() < 2:
pytest.skip(f"Skipping test {script_name} because <2 GPUs are available")
# Run the test if there are at least 2 GPUs
run_python_script(script_name, timeout)

View File

@@ -1,154 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
import torch
from tqdm import tqdm
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
def test_run(my_rank, pipe):
print(f"rank {my_rank} test_run starts....")
# test run
x = torch.tensor([1]).to(pipe.device)
y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device)
if my_rank == 0:
pipe.send_tensor(x)
print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y)
print(f"rank {my_rank} sent tensor y")
x2 = pipe.recv_tensor()
print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor()
print(f"rank {my_rank} received y2 = ", y2)
else:
x2 = pipe.recv_tensor()
print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor()
print(f"rank {my_rank} received y2 = ", y2)
pipe.send_tensor(x)
print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y)
print(f"rank {my_rank} sent tensor y")
assert torch.allclose(x, x2)
assert torch.allclose(y, y2)
print(f"rank {my_rank} test_run passed!")
def stress_test(my_rank, pipe):
print(f"rank {my_rank} stress_test starts....")
tensors: list[torch.Tensor] = []
torch.distributed.barrier()
torch.manual_seed(0)
for i in tqdm(range(500)):
mean = torch.rand(1).item() * 100
std = torch.rand(1).item() * 100
size = torch.randint(900, 1000, (2,))
x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device)
# 5% probability of sending a None
if torch.rand(1).item() < 0.05:
tensors.append(None)
tensors.append(None)
tensors.append(None)
else:
tensors.append(x)
tensors.append(x.mean().unsqueeze(0))
tensors.append(x.std().unsqueeze(0))
torch.distributed.barrier()
for i in tqdm(range(500)):
if my_rank == int((i % 10) > 3):
pipe.send_tensor(tensors[3 * i])
pipe.send_tensor(tensors[3 * i + 1])
pipe.send_tensor(tensors[3 * i + 2])
else:
x = pipe.recv_tensor()
mean = pipe.recv_tensor()
std = pipe.recv_tensor()
if x is None:
assert mean is None
assert std is None
else:
assert torch.allclose(x, tensors[3 * i])
assert x.mean() == mean[0]
assert x.std() == std[0]
torch.distributed.barrier()
def latency_test(my_rank, pipe, nelement, ntensor):
latencies = []
torch.distributed.barrier()
for i in tqdm(range(500)):
tensors = []
if my_rank == 0:
# create tensor
tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)]
torch.distributed.barrier()
if my_rank == 0:
t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device)
for tensor in tensors:
pipe.send_tensor(tensor)
pipe.send_tensor(t)
else:
for _ in range(ntensor):
pipe.recv_tensor()
t = pipe.recv_tensor()
latencies.append(time.time() - t.item())
torch.distributed.barrier()
print("Latency test passed.")
print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms")
if __name__ == "__main__":
my_rank = int(os.environ["RANK"])
torch.distributed.init_process_group(
backend="gloo",
init_method="tcp://localhost:12398",
world_size=2,
rank=my_rank,
)
config = KVTransferConfig(
kv_connector="P2pNcclConnector",
kv_buffer_device="cuda",
kv_buffer_size=1e9,
kv_rank=my_rank,
kv_role="kv_both", # this arg doesn't matter in this test
kv_parallel_size=2,
kv_ip="127.0.0.1",
kv_port=12345,
)
pipe = PyNcclPipe(
local_rank=my_rank,
config=config,
)
test_run(my_rank, pipe)
stress_test(my_rank, pipe)
# Use this function if you want to test the latency of pipe impl.
# latency_test(my_rank, pipe, 1024 * 8 * 128, 80)

View File

@@ -1,9 +0,0 @@
#!/bin/bash
RANK=0 python3 test_send_recv.py &
PID0=$!
RANK=1 python3 test_send_recv.py &
PID1=$!
wait $PID0
wait $PID1

View File

@@ -1,179 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file contains a new class `KVLookupBufferBase` that allows developers to
think of KV cache operations as inserting new KV cache entries (`insert`)
into the lookup buffer and querying existing KV caches (`drop_select`)
from the lookup buffer.
This file also contains a new class `KVStoreBufferBase` that allows developers
to manage the KVCache buffer as a simple key-value storage buffer with basic
put/get operations.
Both classes derive from the common abstract base class `KVCacheBufferBase`.
"""
from abc import ABC, abstractmethod
import torch
class KVCacheBufferBase(ABC):
"""
Abstract base class for a KVCache buffer.
"""
@abstractmethod
def close(self) -> None:
"""Close the buffer and release resources.
This method is responsible for cleaning up resources related to the
KVCache buffer when it is no longer needed.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
class KVLookupBufferBase(KVCacheBufferBase):
"""
Abstract base class for a KVCache lookup buffer.
This class provides an abstraction for a key-value (KV) cache lookup buffer.
The key of the lookup buffer:
- input_tokens: token IDs of the request
- roi: a binary mask on top of input_tokens.
- Purpose of roi: Since KV cache may only be available for a subset of
tokens in the input (for example, when vLLM is connected to an external
KV cache service), roi specifies the subset of tokens that the KV cache
is associated with.
- NOTE: roi can be further extended to describe which part of KV the
current process is holding (each process may only hold a part of KV
due to TP and PP). This is not implemented for now.
The value of the lookup buffer:
- key: the key tensor in the KV cache
- value: the value tensor in the KV cache
- hidden: the final hidden state generated by model forwarding. This allows
vLLM to bypass further model forwarding by transmitting the hidden state.
"""
@abstractmethod
def insert(
self,
input_tokens: torch.Tensor,
roi: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
hidden: torch.Tensor,
) -> None:
"""Insert into the lookup buffer.
The functionality is similar to the following python statement
```
buffer[input_tokens, roi] = [key, value, hidden]
```
FIXME: in the future, we should only have two arguments, key and value,
where key is a tensor dict and value is a tensor dict.
FIXME: we should transmit both sampler outputs and the hidden states.
Args:
input_tokens (torch.Tensor): token IDs.
roi (torch.Tensor): A binary mask on top of the input tokens
key (torch.Tensor): The key tensor in the KV cache.
value (torch.Tensor): The value tensor in the KV cache.
hidden (torch.Tensor): The final hidden state tensor generated
during model forwarding to bypass model
forwarding.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
@abstractmethod
def drop_select(
self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
) -> list[torch.Tensor | None]:
"""Select and *drop* KV cache entries from the lookup buffer.
The functionality is similar to the following python statements
```
ret = buffer.pop(input_tokens, roi)
return ret
```
If `input_tokens` and `roi` are `None`, any KV cache entry in the buffer
may be selected, returned, and removed from the buffer. This is useful
when offloading KV cache to a KV cache storage service.
Args:
input_tokens (torch.Tensor): token IDs.
roi (torch.Tensor): A binary mask on top of the input tokens
Returns:
list[torch.Tensor | None]: A list of tensors; individual entries can be None.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
class KVStoreBufferBase(KVCacheBufferBase):
"""
Abstract base class for a KVCache storage buffer with key-value semantics.
This class provides a simple key-value storage buffer abstraction with basic
put/get operations, enabling fine-grained control over KVCache transfer.
The functionality is similar to a distributed key-value store, where:
- Key: A unique string identifier for the cached entry
- Value:
- Tensor to be stored and retrieved
- None (indicating deletion or empty value)
"""
@abstractmethod
def put(
self,
key: str,
value: torch.Tensor | None,
) -> None:
"""Store a key-value pair in the buffer.
Args:
key (str): Unique identifier for a tensor, this tensor could be the
key cache tensor, value cache tensor, or hidden state tensor
generated during model forwarding.
value (Optional[torch.Tensor]): Tensor to be stored.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
@abstractmethod
def get(
self,
key: str,
) -> torch.Tensor | None:
"""Retrieve a value from the buffer by key.
Args:
key (str): Unique identifier for a tensor, this tensor could be the
key cache tensor, value cache tensor, or hidden state tensor
generated during model forwarding.
Returns:
Optional[torch.Tensor]: Stored tensor if exists, None otherwise.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
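
For reference, the put/get contract of the removed `KVStoreBufferBase` can be illustrated with a minimal in-memory sketch. The class name `InMemoryKVStoreBuffer` is hypothetical and was never part of vLLM; the import path is the module being deleted above.

import torch
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase  # module deleted above


class InMemoryKVStoreBuffer(KVStoreBufferBase):
    """Toy dict-backed buffer following the put/get/close contract above."""

    def __init__(self) -> None:
        self._store: dict[str, torch.Tensor] = {}

    def put(self, key: str, value: torch.Tensor | None) -> None:
        if value is None:
            # A None value indicates deletion (or an empty entry).
            self._store.pop(key, None)
        else:
            self._store[key] = value.clone()

    def get(self, key: str) -> torch.Tensor | None:
        return self._store.get(key)

    def close(self) -> None:
        self._store.clear()


buffer = InMemoryKVStoreBuffer()
buffer.put("layer0.key", torch.zeros(4))
assert buffer.get("missing") is None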

View File

@@ -1,164 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file contains a new class `MooncakeStore` that allows developers to
think of KV cache transfer operations as putting new KV cache entries
into a remote KVStore-based lookup buffer and getting existing KV caches
from this remote lookup buffer.
"""
import json
import os
from dataclasses import dataclass
import torch
from safetensors.torch import load as safetensors_load
from safetensors.torch import save as safetensors_save
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase
from vllm.logger import init_logger
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB
logger = init_logger(__name__)
@dataclass
class MooncakeStoreConfig:
local_hostname: str
metadata_server: str
global_segment_size: int
local_buffer_size: int
protocol: str
device_name: str
master_server_address: str
@staticmethod
def from_file(file_path: str) -> "MooncakeStoreConfig":
"""Load the config from a JSON file."""
with open(file_path) as fin:
config = json.load(fin)
return MooncakeStoreConfig(
local_hostname=config.get("local_hostname"),
metadata_server=config.get("metadata_server"),
global_segment_size=config.get(
"global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
),
local_buffer_size=config.get(
"local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
),
protocol=config.get("protocol", "tcp"),
device_name=config.get("device_name", ""),
master_server_address=config.get("master_server_address"),
)
@staticmethod
def load_from_env() -> "MooncakeStoreConfig":
"""Load config from a file specified in the environment variable."""
config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
if config_file_path is None:
raise ValueError(
"The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
)
return MooncakeStoreConfig.from_file(config_file_path)
class MooncakeStore(KVStoreBufferBase):
def __init__(
self,
config: VllmConfig,
):
try:
from mooncake.store import MooncakeDistributedStore
except ImportError as e:
raise ImportError(
"Please install mooncake by following the instructions at "
"https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
"to run vLLM with MooncakeConnector."
) from e
try:
self.store = MooncakeDistributedStore()
self.config = MooncakeStoreConfig.load_from_env()
logger.info("Mooncake Configuration loaded successfully.")
self.store.setup(
self.config.local_hostname,
self.config.metadata_server,
self.config.global_segment_size,
self.config.local_buffer_size,
self.config.protocol,
self.config.device_name,
self.config.master_server_address,
)
except ValueError as e:
logger.error("Configuration loading failed: %s", e)
raise
except Exception as exc:
logger.error("An error occurred while loading the configuration: %s", exc)
raise
def close(self):
# MooncakeDistributedStore will automatically call the destructor, so
# it is unnecessary to close it manually.
pass
def put(
self,
key: str,
value: torch.Tensor | None,
) -> None:
# A message queue needs to be introduced before making it asynchronous.
if value is not None:
self._put_impl(key, value)
def get(
self,
key: str,
) -> torch.Tensor | None:
# A message queue needs to be introduced before making it asynchronous.
value = self._get_impl(key)
return value
def _put_impl(
self,
key: str,
value: torch.Tensor,
) -> None:
"""Put KVCache to Mooncake Store"""
device_id = value.device.index if value.device.type == "cuda" else -1
device_tensor = torch.tensor(device_id, dtype=torch.int32)
value_bytes = safetensors_save({"tensor": value, "device_id": device_tensor})
try:
self.store.put(key, value_bytes)
except TypeError as err:
logger.error("Failed to put value into Mooncake Store: %s", err)
raise TypeError("Mooncake Store Put Type Error.") from err
def _get_impl(
self,
key: str,
) -> torch.Tensor | None:
"""Get KVCache from Mooncake Store"""
try:
data = self.store.get(key)
except TypeError as err:
logger.error("Failed to get value from Mooncake Store: %s", err)
raise TypeError("Mooncake Store Get Type Error.") from err
if data:
loaded_tensors = safetensors_load(data)
tensor = loaded_tensors["tensor"]
device_id_tensor = loaded_tensors["device_id"]
device_id = int(device_id_tensor.item())
device = (
torch.device("cuda", device_id)
if device_id >= 0
else torch.device("cpu")
)
return tensor.to(device)
return None
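
The removed `MooncakeStore` reads its configuration from a JSON file whose path is given by the `MOONCAKE_CONFIG_PATH` environment variable. Below is a hedged sketch of such a file, written from Python; the addresses are placeholder values, while the two size values are the defaults defined in the removed module.

import json
import os

# Keys mirror MooncakeStoreConfig above; all addresses are placeholders.
store_config = {
    "local_hostname": "127.0.0.1",
    "metadata_server": "127.0.0.1:2379",
    "global_segment_size": 3355443200,  # DEFAULT_GLOBAL_SEGMENT_SIZE (3.125 GiB)
    "local_buffer_size": 1073741824,  # DEFAULT_LOCAL_BUFFER_SIZE (1.0 GiB)
    "protocol": "tcp",
    "device_name": "",
    "master_server_address": "127.0.0.1:50051",
}

with open("mooncake_store_config.json", "w") as f:
    json.dump(store_config, f, indent=2)

# MooncakeStoreConfig.load_from_env() would then pick the file up via this variable.
os.environ["MOONCAKE_CONFIG_PATH"] = os.path.abspath("mooncake_store_config.json")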

View File

@@ -1,242 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Implements a distributed key-value (KV) cache transfer mechanism.
Key Features:
- Distributed KV cache transmission using PyNccl pipes.
- Non-blocking `insert`, blocking `drop_select`.
- Uses a CPU signal pipe to avoid race conditions.
- Handles buffer size constraints and provides a backpressure mechanism to
stop the prefill instance when the decode instance is slow.
"""
import threading
from collections import deque
import torch
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.logger import init_logger
logger = init_logger(__name__)
class SimpleBuffer(KVLookupBufferBase):
def __init__(
self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float
):
"""
signal_pipe: on CPU
NOTE: on-device recv will block all threads in the process, making the
KV cache producer unable to listen to new requests while transmitting
KV cache. Luckily, CPU recv only blocks the current thread, so we use
CPU recv to listen to new requests.
data_pipe: on device (e.g. GPU)
"""
self.buffer: deque[list[torch.Tensor]] = deque()
self.buffer_size = 0
self.buffer_size_threshold = buffer_size_thresh
self.buffer_cv = threading.Condition()
self.signal_pipe = signal_pipe
self.data_pipe = data_pipe
self.request_handling_thread: threading.Thread | None = None
self.normal_signal = torch.tensor([0], device="cpu")
self.end_signal = None
def _matches(
self,
tokens_roi_sender: list[torch.Tensor],
tokens_roi_recver: list[torch.Tensor],
):
# tokens_roi_sender: tokens and roi of the producer (in the buffer)
# tokens_roi_recver: tokens and roi of the consumer (query)
tokens_sender = tokens_roi_sender[0]
tokens_recver = tokens_roi_recver[0]
roi_sender = tokens_roi_sender[1]
roi_recver = tokens_roi_recver[1]
if tokens_recver is None:
# consumer sends an empty request
# semantics: DROP SELECT * LIMIT 1
# so any of the data in the buffer can be drop-selected
return True
# Assuming that roi is a binary mask on tokens
tokens_sender = tokens_sender[roi_sender]
tokens_recver = tokens_recver[roi_recver]
# simple common prefix matching
min_length = min(len(tokens_sender), len(tokens_recver))
if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]):
return min_length
return 0
def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None:
assert tensor is not None, "Use self.data_pipe.send(None) instead"
self.buffer_size -= tensor.element_size() * tensor.numel()
if tensor.dtype == torch.bool:
tensor = tensor.float()
self.data_pipe.send_tensor(tensor)
def _get_element_size(self, data: list | torch.Tensor | None):
if isinstance(data, torch.Tensor):
return data.element_size() * data.numel()
if not data:
# cannot perform `not data` on a tensor
# so this check needs to go after the check above
return 0
raise AssertionError(f"Unknown data type {type(data)}")
def _add_to_buffer(
self,
input_tokens: torch.Tensor,
roi: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
hidden: torch.Tensor,
):
if isinstance(input_tokens, torch.Tensor):
input_tokens = input_tokens.clone()
if isinstance(roi, torch.Tensor):
roi = roi.clone()
if isinstance(key, torch.Tensor):
key = key.clone()
if isinstance(value, torch.Tensor):
value = value.clone()
if isinstance(hidden, torch.Tensor):
hidden = hidden.clone()
buffer_item = [input_tokens, roi, key, value, hidden]
data_size = sum([self._get_element_size(data) for data in buffer_item])
with self.buffer_cv:
if self.buffer_size + data_size > self.buffer_size_threshold:
# log outside the while loop to avoid this message being logged
# repeatedly.
logger.debug("KV transfer buffer is full. Handling...")
while self.buffer_size + data_size > self.buffer_size_threshold:
self.buffer_cv.wait()
self.buffer_size += data_size
self.buffer.append(buffer_item)
self.buffer_cv.notify()
def _is_end_signal(self, signal):
return signal is None
def drop_select_handler(self):
try:
while True:
signal = self.signal_pipe.recv_tensor()
if self._is_end_signal(signal):
logger.info("Received end signal!")
break
input_tokens = self.data_pipe.recv_tensor()
roi = self.data_pipe.recv_tensor()
assert roi is not None, (
"Please provide the roi when sending drop-select request"
)
roi = roi > 0.5
tokens_roi_recver = [input_tokens, roi]
def is_buffer_available(
tokens_roi_recver: list[torch.Tensor],
) -> bool:
# perform input tokens and roi matching
# FIXME: this matching is O(n), ideally it should be O(1)
# but this buffer size won't (and shouldn't) be too large so
# the fix is not urgent.
for _ in range(len(self.buffer)):
if self._matches(self.buffer[0], tokens_roi_recver) > 0:
return True
# rotate the element we just accessed to the end
self.buffer.rotate(-1)
return False
with self.buffer_cv:
while not is_buffer_available(tokens_roi_recver):
logger.debug("KV transfer buffer is not available. Waiting...")
self.buffer_cv.wait()
# need to clone the tensor
# in case the tensor is freed before sending finishes
matched_item = self.buffer.popleft()
for tensor in matched_item:
self._send_tensor_and_dec_size(tensor)
self.buffer_cv.notify()
except RuntimeError as e:
if "Connection closed by peer" not in str(e):
raise e
logger.debug("Closing drop_select_handler")
def drop_select(
self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
) -> list[torch.Tensor | None]:
assert self.request_handling_thread is None, (
"drop_select should be called by the KV cache consumer "
"(e.g. the decode vLLM instance)"
)
if isinstance(input_tokens, torch.Tensor):
input_tokens = input_tokens.clone()
if isinstance(roi, torch.Tensor):
roi = roi.clone().float()
self.signal_pipe.send_tensor(self.normal_signal)
self.data_pipe.send_tensor(input_tokens)
self.data_pipe.send_tensor(roi)
input_tokens = self.data_pipe.recv_tensor()
roi = self.data_pipe.recv_tensor()
if roi is not None:
# convert from float tensor to bool tensor
# as PyNccl does not support sending bool tensor
roi = roi > 0.5
key = self.data_pipe.recv_tensor()
value = self.data_pipe.recv_tensor()
hidden = self.data_pipe.recv_tensor()
return [input_tokens, roi, key, value, hidden]
def insert(
self,
input_tokens: torch.Tensor,
roi: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
hidden: torch.Tensor,
) -> None:
self._add_to_buffer(input_tokens, roi, key, value, hidden)
# when calling the insert, the current process is a sender
# need to launch the request handler and start listening to request.
if self.request_handling_thread is None:
self.request_handling_thread = threading.Thread(
target=self.drop_select_handler
)
self.request_handling_thread.start()
def close(self):
if (
hasattr(self, "request_handling_thread")
and self.request_handling_thread is not None
):
self.request_handling_thread.join()
else:
# TODO: have an explicit close signal and an explicit way to
# check whether this side is the requester
self.signal_pipe.send_tensor(self.end_signal)

View File

@@ -1,66 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file defines an interface `KVPipeBase`
that provides an abstraction for sending and receiving tensors, or None, via
distributed communications.
All classes instantiated from this interface are assumed to be a FIFO pipe.
If your distributed communication platform already supports key-value lookup,
you can bypass this interface and directly start from `kv_lookup_buffer`.
"""
from abc import ABC, abstractmethod
import torch
class KVPipeBase(ABC):
"""
This class provides an interface for sending and receiving tensors, or
None, by distributed communications.
"""
@abstractmethod
def send_tensor(self, tensor: torch.Tensor | None) -> None:
"""Send a tensor, or None, via the pipe.
Need to support sending None -- important for error handling.
TODO: add a `key` argument so that we can use traditional
key-value database as the distributed communication mechanism behind
the pipe.
Args:
tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
@abstractmethod
def recv_tensor(self) -> torch.Tensor | None:
"""Receive a tensor (can be None) from the pipeline.
Returns:
Optional[torch.Tensor]: The tensor received from the pipeline. Can
be None.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
@abstractmethod
def close(self) -> None:
"""Close the pipeline and release resources.
This method is responsible for closing the communication pipeline
and releasing any resources associated with it.
Raises:
NotImplementedError: This method must be implemented in subclasses.
"""
raise NotImplementedError
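
For reference, the FIFO contract defined by the removed `KVPipeBase`, including the requirement that `None` is a legal payload, can be illustrated with a single-process, queue-backed sketch; `LocalQueuePipe` is a hypothetical name used only here.

import queue

import torch

from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase  # module deleted above


class LocalQueuePipe(KVPipeBase):
    """Toy FIFO pipe that never leaves the current process."""

    def __init__(self) -> None:
        self._queue: queue.Queue = queue.Queue()

    def send_tensor(self, tensor: torch.Tensor | None) -> None:
        # None must travel through the pipe just like a tensor.
        self._queue.put(tensor if tensor is None else tensor.clone())

    def recv_tensor(self) -> torch.Tensor | None:
        return self._queue.get()

    def close(self) -> None:
        pass


pipe = LocalQueuePipe()
pipe.send_tensor(torch.ones(3))
pipe.send_tensor(None)
assert torch.equal(pipe.recv_tensor(), torch.ones(3))
assert pipe.recv_tensor() is None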

View File

@@ -1,295 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os
import struct
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
import torch
import zmq
from safetensors.torch import load as safetensors_load
from safetensors.torch import save as safetensors_save
from vllm.config.kv_transfer import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.logger import init_logger
from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port
logger = init_logger(__name__)
NONE_INT = -150886311
@dataclass
class MooncakeTransferEngineConfig:
prefill_url: str
decode_url: str
metadata_backend: str | None
metadata_server: str
protocol: str
device_name: str
@staticmethod
def from_file(file_path: str) -> "MooncakeTransferEngineConfig":
"""Load the config from a JSON file."""
with open(file_path) as fin:
config = json.load(fin)
return MooncakeTransferEngineConfig(
prefill_url=config.get("prefill_url"),
decode_url=config.get("decode_url"),
metadata_backend=config.get("metadata_backend", None),
metadata_server=config.get("metadata_server"),
protocol=config.get("protocol", "tcp"),
device_name=config.get("device_name", ""),
)
@staticmethod
def load_from_env() -> "MooncakeTransferEngineConfig":
"""Load config from a file specified in the environment variable."""
config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
if config_file_path is None:
raise ValueError(
"The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
)
return MooncakeTransferEngineConfig.from_file(config_file_path)
class MooncakeTransferEngine:
"""Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ."""
def __init__(self, kv_rank: int, local_rank: int):
try:
from mooncake.engine import TransferEngine
except ImportError as e:
raise ImportError(
"Please install mooncake by following the instructions at "
"https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
"to run vLLM with MooncakeConnector."
) from e
self.engine = TransferEngine()
self.local_rank = local_rank
try:
self.config = MooncakeTransferEngineConfig.load_from_env()
logger.info("Mooncake Configuration loaded successfully.")
except ValueError as e:
logger.error(e)
raise
except Exception as exc:
logger.error("An error occurred while loading the configuration: %s", exc)
raise
prefill_host, base_prefill_port = split_host_port(self.config.prefill_url)
decode_host, base_decode_port = split_host_port(self.config.decode_url)
# Avoid ports conflict when running prefill and decode on the same node
if prefill_host == decode_host and base_prefill_port == base_decode_port:
base_decode_port = base_decode_port + 100
prefill_port = base_prefill_port + self.local_rank
decode_port = base_decode_port + self.local_rank
self.prefill_url = join_host_port(prefill_host, prefill_port)
self.decode_url = join_host_port(decode_host, decode_port)
self.initialize(
self.prefill_url if kv_rank == 0 else self.decode_url,
self.config.metadata_server,
self.config.protocol,
self.config.device_name,
self.config.metadata_backend,
)
self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url
# Initialize ZeroMQ context and sockets
self.context = zmq.Context() # type: ignore[attr-defined]
self.sender_socket = self.context.socket(zmq.constants.PUSH)
self.receiver_socket = self.context.socket(zmq.constants.PULL)
self.sender_ack = self.context.socket(zmq.constants.PULL)
self.receiver_ack = self.context.socket(zmq.constants.PUSH)
self.buffer_cleaner = ThreadPoolExecutor(max_workers=1)
self._setup_metadata_sockets(
kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port
)
def _setup_metadata_sockets(
self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int
) -> None:
"""Set up ZeroMQ sockets for sending and receiving data."""
# Offsets < 8 are left for initialization in case tp and pp are enabled
p_rank_offset = p_port + 8 + self.local_rank * 2
d_rank_offset = d_port + 8 + self.local_rank * 2
if kv_rank == 0:
self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1))
self.receiver_socket.connect(
make_zmq_path("tcp", d_host, d_rank_offset + 1)
)
self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2))
self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2))
else:
self.receiver_socket.connect(
make_zmq_path("tcp", p_host, p_rank_offset + 1)
)
self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1))
self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2))
self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2))
def initialize(
self,
local_hostname: str,
metadata_server: str,
protocol: str,
device_name: str,
metadata_backend: str | None,
) -> None:
"""Initialize the mooncake instance."""
if metadata_backend is None:
self.engine.initialize(
local_hostname, metadata_server, protocol, device_name
)
else:
supported_backend = ["etcd", "redis"]
metadata_backend = metadata_backend.lower()
if metadata_backend not in supported_backend:
raise ValueError(
"Mooncake Configuration error. `metadata_backend`"
f" should be one of {supported_backend}."
)
self.engine.initialize_ext(
local_hostname, metadata_server, protocol, device_name, metadata_backend
)
def allocate_managed_buffer(self, length: int) -> int:
"""Allocate a managed buffer of the specified length."""
ret = self.engine.allocate_managed_buffer(length)
if ret <= 0:
logger.error("Allocation Return Error")
raise Exception("Allocation Return Error")
return ret
def free_managed_buffer(self, buffer: int, length: int) -> int:
"""Free a previously allocated managed buffer."""
return self.engine.free_managed_buffer(buffer, length)
def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int:
"""Synchronously transfer data to the specified address."""
ret = self.engine.transfer_sync_read(
self.remote_url, buffer, peer_buffer_address, length
)
if ret < 0:
logger.error("Transfer Return Error")
raise Exception("Transfer Return Error")
return ret
def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int:
"""Write bytes to the allocated buffer."""
return self.engine.write_bytes_to_buffer(buffer, user_data, length)
def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes:
"""Read bytes from the allocated buffer."""
return self.engine.read_bytes_from_buffer(buffer, length)
def wait_for_ack(self, src_ptr: int, length: int) -> None:
"""Asynchronously wait for ACK from the receiver."""
ack = self.sender_ack.recv()
if ack != b"ACK":
logger.error("Failed to receive ACK from the receiver")
self.free_managed_buffer(src_ptr, length)
def send_bytes(self, user_data: bytes) -> None:
"""Send bytes to the remote process."""
length = len(user_data)
src_ptr = self.allocate_managed_buffer(length)
self.write_bytes_to_buffer(src_ptr, user_data, length)
self.sender_socket.send_multipart(
[struct.pack("!Q", src_ptr), struct.pack("!Q", length)]
)
self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length)
def recv_bytes(self) -> bytes:
"""Receive bytes from the remote process."""
data = self.receiver_socket.recv_multipart()
src_ptr = struct.unpack("!Q", data[0])[0]
length = struct.unpack("!Q", data[1])[0]
dst_ptr = self.allocate_managed_buffer(length)
self.transfer_sync(dst_ptr, src_ptr, length)
ret = self.read_bytes_from_buffer(dst_ptr, length)
# Buffer cleanup
self.receiver_ack.send(b"ACK")
self.free_managed_buffer(dst_ptr, length)
return ret
class MooncakePipe(KVPipeBase):
"""MooncakeTransferEngine based Pipe implementation."""
def __init__(
self, local_rank: int, config: KVTransferConfig, device: str | None = None
):
"""Initialize the mooncake pipe and set related parameters."""
self.config = config
self.local_rank = local_rank
self.kv_rank = self.config.kv_rank
assert self.kv_rank is not None
if device is None:
self.device = self._select_device(self.config.kv_buffer_device)
else:
self.device = self._select_device(device)
self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
self.transport_thread: ThreadPoolExecutor | None = None
self.none_tensor = torch.tensor([NONE_INT], device=self.device)
def _select_device(self, device: str) -> torch.device:
"""Select available device (CUDA or CPU)."""
logger.info("Selecting device: %s", device)
if device == "cuda":
return torch.device(f"cuda:{self.local_rank}")
else:
return torch.device("cpu")
def tensor_hash(self, tensor: torch.Tensor) -> int:
"""Calculate the hash value of the tensor."""
return hash(tensor.data_ptr())
def _send_impl(self, tensor: torch.Tensor) -> None:
"""Implement the tensor sending logic using safetensors."""
self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
def _recv_impl(self) -> torch.Tensor:
"""Implement the tensor receiving logic using safetensors."""
data = self.transfer_engine.recv_bytes()
return safetensors_load(data)["tensor"].to(self.device)
def send_tensor(self, tensor: torch.Tensor | None) -> None:
"""Send tensor to the target process."""
if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1)
tensor = tensor if tensor is not None else self.none_tensor
assert len(tensor.shape) > 0
self.transport_thread.submit(self._send_impl, tensor)
def recv_tensor(self) -> torch.Tensor | None:
"""Receive tensor from other processes."""
if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1)
tensor = self.transport_thread.submit(self._recv_impl).result()
if tensor.numel() == 1 and tensor.item() == NONE_INT:
return None
else:
return tensor
def close(self) -> None:
"""Cleanup logic when closing the pipe."""
self.transfer_engine.sender_socket.close()
self.transfer_engine.receiver_socket.close()
self.transfer_engine.sender_ack.close()
self.transfer_engine.receiver_ack.close()
self.transfer_engine.context.term() # Terminate the ZMQ context
logger.info("Closed the transfer engine and cleaned up resources.")

View File

@@ -1,285 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This module implements a PyNccl pipe for sending and receiving
Optional[torch.Tensor] between distributed ranks with advanced
communication features.
Key Features:
- Supports sending and receiving tensors with metadata
- Handles both CUDA and CPU device communications
- Implements a non-blocking tensor transfer mechanism
- Manages buffer size and provides backpressure control
- Supports distributed process groups with configurable parameters
"""
import threading
import time
from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor
import torch
from vllm.config.kv_transfer import KVTransferConfig
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
logger = init_logger(__name__)
class BrokenPipeException(Exception):
def __init__(self, message):
self.message = message
super().__init__(self.message)
Metadata = dict[str, torch.Tensor | None]
class PyNcclPipe(KVPipeBase):
METADATA_LENGTH = 16
MAX_TENSOR_DIMENSIONS = 14
METADATA_DTYPE = torch.int64
def __init__(
self,
local_rank: int,
config: KVTransferConfig,
device: str | None = None,
port_offset: int = 0,
):
self.config = config
self.local_rank = local_rank
self.kv_rank = self.config.kv_rank
assert self.kv_rank is not None
self.kv_parallel_size = self.config.kv_parallel_size
if device is None:
self.device = self._select_device(self.config.kv_buffer_device)
else:
self.device = self._select_device(device)
# build distributed connection and send/recv implementation
store_timeout = self.config.get_from_extra_config("store_timeout", 300)
self.group = StatelessProcessGroup.create(
host=self.config.kv_ip,
port=self.config.kv_port + port_offset,
rank=self.kv_rank,
world_size=self.kv_parallel_size,
store_timeout=store_timeout,
)
# add a barrier to make sure the connection is initiated properly
self.group.barrier()
impl = self._get_device_send_recv_impl(self.group)
self.device_send_func, self.device_recv_func = impl
# set target rank
self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size
self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size
# transportation-related variables
self.transport_thread: ThreadPoolExecutor | None = None
self.buffer_size = 0
self.buffer_size_lock = threading.Lock()
self.buffer_size_thresh = self.config.kv_buffer_size
def _get_device_send_recv_impl(
self, group: StatelessProcessGroup
) -> tuple[
Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None]
]:
send: Callable[[torch.Tensor, int], None]
recv: Callable[[torch.Tensor, int], None]
if self.device.type == "cuda":
# use PyNCCL for send / recv
comm = PyNcclCommunicator(group, device=self.local_rank)
comm.disabled = False
send, recv = comm.send, comm.recv # type: ignore
else:
# This send / recv implementation here is NOT intended to transfer
# KV caches (and should NOT be repurposed to transfer KV caches).
# Currently it is only used to transmit control-plane messages
# for the KV lookup buffer (SimpleBuffer).
send = group.send_obj
def my_recv(x, src):
x[...] = group.recv_obj(src)
recv = my_recv
return send, recv
def _select_device(self, device: str):
logger.info("Selecting device: %s", device)
if device == "cuda":
return torch.device(f"cuda:{self.local_rank}")
else:
return torch.device("cpu")
def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
"""
Create the metadata as a dictionary based on the input tensor.
Args:
tensor: The input tensor or None if no tensor is provided.
Returns:
metadata: A dictionary with the following keys:
- "dtype": The data type of the tensor or None.
- "shape": The shape of the tensor or None.
"""
if tensor is None:
return {"dtype": None, "shape": None}
else:
return {"dtype": tensor.dtype, "shape": tensor.shape}
def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
"""
Create a buffer to receive the tensor based on the provided metadata.
Args:
metadata: A dictionary with keys "dtype" and "shape",
describing the tensor's data type and shape.
Returns:
buffer: A tensor of the specified type and shape,
allocated on `self.device`.
"""
return torch.empty(
metadata["shape"], dtype=metadata["dtype"], device=self.device
)
def _send_metadata(self, metadata: Metadata):
"""
Send the metadata dictionary to the target rank.
Args:
metadata: A dictionary with keys "dtype" and "shape".
"""
self.group.send_obj(metadata, self.target_rank_for_send)
def _recv_metadata(self) -> Metadata:
"""
Receive the metadata dictionary from the target rank.
Returns:
metadata: A dictionary with keys "dtype" and "shape"
describing the tensor.
"""
return self.group.recv_obj(self.target_rank_for_recv)
def _send_impl(self, tensor: torch.Tensor | None) -> None:
"""
The actual implementation of sending the tensor and its metadata to the
target rank.
Args:
tensor: The input tensor to be sent, or `None` if no tensor is
being sent.
"""
metadata = self._make_metadata(tensor)
self._send_metadata(metadata)
if tensor is not None:
self.device_send_func(tensor.to(self.device), self.target_rank_for_send)
def _recv_impl(self) -> torch.Tensor | None:
"""
The actual implementation of receiving a tensor and its metadata from
the target rank.
Returns:
buffer: The received tensor, or `None` if no tensor is received.
"""
metadata = self._recv_metadata()
if metadata["dtype"] is None:
return None
buffer = self._prepare_recv_buffer(metadata)
self.device_recv_func(buffer, self.target_rank_for_recv)
return buffer
def send_tensor_wrapper(
self, tensor: torch.Tensor | None, tensor_size: int
) -> None:
"""
Wrapper for _send_impl to handle exceptions and update buffer size.
"""
try:
self._send_impl(tensor)
with self.buffer_size_lock:
self.buffer_size -= tensor_size
except Exception as e:
logger.error(
"[rank%d]: Exception when trying to send %s, msg: %s",
torch.distributed.get_rank(),
str(tensor),
str(e),
)
import traceback
traceback.print_exc()
def block_if_full(self):
"""
Block the current thread if the buffer size is larger than the
threshold.
"""
while self.buffer_size > self.buffer_size_thresh:
logger.debug("KV cache transfer pipe is full. Waiting...")
time.sleep(0.05)
def send_tensor(self, tensor: torch.Tensor | None) -> None:
"""
Sends a tensor and its metadata to the destination rank in a
non-blocking way.
Args:
tensor: The tensor to send, or `None` if no tensor is being sent.
"""
if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1)
if tensor is not None:
tensor_size = tensor.element_size() * tensor.numel()
else:
tensor_size = 0
self.block_if_full()
with self.buffer_size_lock:
self.buffer_size += tensor_size
self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)
def recv_tensor(self) -> torch.Tensor | None:
"""
Receives a tensor and its metadata from the source rank. Blocking call.
Returns:
The received tensor, or `None` if no tensor is received.
"""
if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1)
future = self.transport_thread.submit(self._recv_impl)
try:
tensor = future.result()
except Exception as e:
logger.error("Encountering exception in KV receiving thread")
logger.error("%s", e)
logger.error("My device: %s", self.device)
import traceback
traceback.print_exc()
raise e
return tensor
def close(self):
"""
Close the pipe and release associated resources.
"""
if hasattr(self, "transport_thread") and self.transport_thread is not None:
self.transport_thread.shutdown()
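
To close, a condensed two-rank sketch of how the removed `PyNcclPipe` was exercised. It is modeled directly on the removed test_send_recv.py above (same config values and init_method) and, like that test, is launched as two processes with RANK=0 and RANK=1.

import os

import torch

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe  # module deleted above

rank = int(os.environ["RANK"])  # run two copies: RANK=0 and RANK=1
torch.distributed.init_process_group(
    backend="gloo", init_method="tcp://localhost:12398", world_size=2, rank=rank
)
config = KVTransferConfig(
    kv_connector="P2pNcclConnector",
    kv_buffer_device="cuda",
    kv_buffer_size=1e9,
    kv_rank=rank,
    kv_role="kv_both",
    kv_parallel_size=2,
    kv_ip="127.0.0.1",
    kv_port=12345,
)
pipe = PyNcclPipe(local_rank=rank, config=config)
if rank == 0:
    pipe.send_tensor(torch.arange(4).to(pipe.device))  # non-blocking: queued to a worker thread
    pipe.send_tensor(None)  # None travels as metadata only
else:
    print(pipe.recv_tensor())  # blocking: waits for metadata, then the payload
    assert pipe.recv_tensor() is None
pipe.close()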