A typical RAG pipeline retrieves documents, filters them by relevance, and generates an answer. X-Ray tracks which documents made it into the LLM context.
What to debug
- Which documents were retrieved but filtered out
- Relevance scores and drop reasons
- Prompts sent to the LLM and responses
Code
import xray
# Initialize the X-Ray SDK once at startup; subsequent decorators attach to this session.
xray.init(api_key="your-api-key")
@xray.pipeline("rag-retrieval")
def answer_question(question: str):
    """Run the full RAG pipeline: retrieve candidates, filter, then answer.

    Tags the trace with the question length so runs can be segmented by
    input size in the X-Ray UI.
    """
    xray.tag("question_length", len(question))
    retrieved = retrieve_docs(question)
    kept = filter_relevance(retrieved, question)
    return generate_answer(question, kept)
@xray.step("RETRIEVE")
def retrieve_docs(question: str):
    """Embed the question and pull candidate documents from the vector store.

    Records the embedding model name as a step metric so retrieval runs
    are comparable across model changes.
    """
    query_vector = embed(question)
    xray.metric("embedding_model", "text-embedding-3-small")
    return vector_db.search(query_vector, limit=100)
@xray.step("FILTER")
def filter_relevance(docs, question):
    """Yield documents that clear the relevance threshold and are current.

    Every candidate gets a cross-encoder relevance score recorded via
    ``xray.score``; candidates that fail a check are recorded with a drop
    reason instead of being yielded.
    """
    for candidate in docs:
        relevance = cross_encoder.score(question, candidate.text)
        xray.score(candidate, relevance)
        if relevance < 0.3:
            xray.drop(candidate, "low_relevance")
        elif candidate.is_outdated:
            xray.drop(candidate, "outdated_content")
        else:
            yield candidate
@xray.step("LLM_CALL")
def generate_answer(question: str, docs: list):
    """Assemble the prompt from the surviving documents and call the LLM.

    Both the outgoing prompt and the model's response are captured as
    artifacts, and the number of context documents is recorded as a metric.
    """
    combined_context = "\n".join(d.text for d in docs)
    prompt = f"Context:\n{combined_context}\n\nQuestion: {question}"
    xray.artifact("prompt", prompt)
    xray.metric("context_docs", len(docs))
    completion = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
    )
    answer = completion.choices[0].message.content
    xray.artifact("response", answer)
    return answer