Snyk has a proof-of-concept or detailed explanation of how to exploit this vulnerability.
The probability is the direct output of the EPSS model, and conveys an overall sense of the threat of exploitation in the wild. The percentile measures the EPSS probability relative to all known EPSS scores. Note: This data is updated daily, relying on the latest available EPSS model version. Check out the EPSS documentation for more details.
In a few clicks we can analyze your entire application and see what components are vulnerable in your application, and suggest you quick fixes.
Test your applicationsUpgrade llama-index
to version 0.12.41 or higher.
llama-index is an Interface between LLMs and your data
Affected versions of this package are vulnerable to Expected Behavior Violation via the DocugamiReader
class. An attacker can cause loss of important document content, disrupt parent-child chunk hierarchies, and lead to inaccurate AI outputs by exploiting hash collisions that result in overwriting structurally distinct document chunks containing identical text.
from hashlib import md5
from dataclasses import dataclass
from typing import Optional, Dict, List
@dataclass
class FakeDocument:
text: str
metadata: Dict[str, str]
excluded_llm_metadata_keys: Optional[List[str]] = None
XPATH_KEY = "xpath"
ID_KEY = "id"
DOCUMENT_NAME_KEY = "name"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
@dataclass
class FakeChunk:
text: str
xpath: str
structure: str
tag: str
parent: Optional['FakeChunk'] = None
document_name = "test_doc"
additional_doc_metadata = None
max_text_length = 4096
include_project_metadata_in_doc_metadata = True
parent_id_key = "doc_id"
def _build_framework_chunk(dg_chunk: FakeChunk) -> FakeDocument:
_hashed_id = md5(dg_chunk.text.encode()).hexdigest()
metadata = {
XPATH_KEY: dg_chunk.xpath,
ID_KEY: _hashed_id,
DOCUMENT_NAME_KEY: document_name,
STRUCTURE_KEY: dg_chunk.structure,
TAG_KEY: dg_chunk.tag,
}
if additional_doc_metadata and include_project_metadata_in_doc_metadata:
metadata.update(additional_doc_metadata)
return FakeDocument(
text=dg_chunk.text[:max_text_length],
metadata=metadata,
excluded_llm_metadata_keys=[XPATH_KEY, ID_KEY, STRUCTURE_KEY],
)
parent_chunk = FakeChunk(
text="Parent clause.",
xpath="/Document/Section/Parent",
structure="section",
tag="section"
)
dg_chunks = [
FakeChunk(
text="This agreement may be terminated at any time.",
xpath="/Document/Section/Clause",
structure="clause",
tag="p",
parent=parent_chunk,
),
FakeChunk(
text="This agreement may be terminated at any time.",
xpath="/Document/Appendix/Note",
structure="note",
tag="p",
parent=parent_chunk,
),
]
framework_chunks: Dict[str, FakeDocument] = {}
for dg_chunk in dg_chunks:
framework_chunk = _build_framework_chunk(dg_chunk)
chunk_id = framework_chunk.metadata.get(ID_KEY)
if chunk_id:
framework_chunks[chunk_id] = framework_chunk
if dg_chunk.parent:
framework_parent_chunk = _build_framework_chunk(dg_chunk.parent)
parent_id = framework_parent_chunk.metadata.get(ID_KEY)
if parent_id and framework_parent_chunk.text:
framework_chunk.metadata[parent_id_key] = parent_id
framework_chunks[parent_id] = framework_parent_chunk
import pandas as pd
df = pd.DataFrame([
{
"id": doc.metadata[ID_KEY],
"text": doc.text,
"xpath": doc.metadata[XPATH_KEY],
"structure": doc.metadata[STRUCTURE_KEY],
"tag": doc.metadata[TAG_KEY],
"is_parent": doc.text == "Parent clause.",
}
for doc in framework_chunks.values()
])
print(df.to_string(index=False))