# RAG-AI/SimpleRAG/main.py

import ollama


# Model identifier passed to ollama.embed() for both dataset chunks and queries.
EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
# Model identifier passed to ollama.chat() to generate the final answer.
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'


# Load the dataset and print how many entries it contains. Each non-blank
# line of the file is one entry. Surrounding whitespace (including the
# trailing newline) is stripped so entries embed and print cleanly, and blank
# lines are skipped so they do not become empty chunks.
with open('cat-facts.txt', 'r', encoding='utf-8') as file:
    dataset = [line.strip() for line in file if line.strip()]
print(f'loaded {len(dataset)} entries')


# In-memory vector store: a list of (chunk, embedding) tuples.
VECTOR_DB = []


def add_chunk_to_database(chunk):
    """Embed ``chunk`` with EMBEDDING_MODEL and append it to VECTOR_DB.

    Args:
        chunk: Text to embed and store.
    """
    embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]
    VECTOR_DB.append((chunk, embedding))


# Index the dataset, treating each line as one chunk. This loop must live at
# module level: when it was nested inside add_chunk_to_database it never ran
# (nothing called the function), and any call would have re-entered the
# function for every dataset line until the recursion limit was hit.
for i, chunk in enumerate(dataset):
    add_chunk_to_database(chunk)
    print(f'added chunk {i+1} / {len(dataset)} to the database')

def cosine_similarity(a, b):
    """Return the cosine similarity of two numeric vectors.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers; paired element-wise with ``a`` via ``zip``,
            so extra trailing elements of the longer vector are ignored in
            the dot product.

    Returns:
        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero
        magnitude (including empty input), where the ratio is undefined.
    """
    dot_product = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(x ** 2 for x in b) ** 0.5
    denominator = norm_a * norm_b
    # Guard against ZeroDivisionError for all-zero or empty vectors.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

def retrieve(query, top_n=3):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Text to embed with EMBEDDING_MODEL and compare against every
            entry in VECTOR_DB.
        top_n: Number of leading results to keep after ranking.

    Returns:
        A list of (chunk, similarity) tuples, highest similarity first.
    """
    query_vector = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]
    # Score every stored chunk against the query.
    scored = [
        (text, cosine_similarity(query_vector, vector))
        for text, vector in VECTOR_DB
    ]
    # Higher similarity means more relevant, so rank in descending order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Ask the user for a question and fetch the most relevant chunks.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
    # Strip the chunk's trailing newline so each result prints on one line.
    print(f' - (similarity: {similarity:.2f}) {chunk.rstrip()}')

# Build the context list outside the f-string: a backslash inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
context = '\n'.join(f' - {chunk.rstrip()}' for chunk, _ in retrieved_knowledge)
instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
{context}
'''

stream = ollama.chat(
    model=LANGUAGE_MODEL,
    messages=[
        {'role': 'system', 'content': instruction_prompt},
        {'role': 'user', 'content': input_query},
    ],
    stream=True,
)

# Print the chatbot's response piece by piece as it streams in.
print('Chatbot response:')
for part in stream:
    print(part['message']['content'], end='', flush=True)
simple rag, working on embedding 2025-11-24 15:26:15 -06:00

			`import ollama`


			`EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'`
			`LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'`



			`#open dataset and return the number of lines of information within the dataset`
			`dataset = []`
			`with open('cat-facts.txt', 'r', encoding='utf-8') as file:`
			`dataset = file.readlines()`
			`print(f'loaded {len(dataset)} entries')`



			`VECTOR_DB = []`
			`def add_chunk_to_database(chunk):`
			`embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]`
			`VECTOR_DB.append((chunk, embedding))`

			`#go through each line of the dataset as if each line is a chunk`
			`for i, chunk in enumerate(dataset):`
			`add_chunk_to_database(chunk)`
			`print(f'added chunk {i+1} / {len(dataset)} to the database')`

			`#compares similarity between added data and existing data`
			`def cosine_similarity(a, b):`
			`dot_product = sum([x * y for x, y in zip(a, b)])`
			`norm_a = sum([x ** 2 for x in a]) ** 0.5`
			`norm_b = sum([x ** 2 for x in b]) ** 0.5`
			`return dot_product / (norm_a * norm_b)`

			`def retrieve(query, top_n=3):`
			`query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]`
			`#temporary list to store (chunk, similarity) pairs`
			`similarities = []`
			`for chunk, embedding in VECTOR_DB:`
			`similarity = cosine_similarity(query_embedding, embedding)`
			`similarities.append((chunk, similarity))`
			`#sort by similarity in descending order, becuase higher similarity means more relevant chunks`
			`similarities.sort(key=lambda x: x[1], reverse=True)`
			`#return the top N most chunks`
			`return similarities[:top_n]`

			`input_query = input('Ask me a question: ')`
			`retrieved_knowledge = retrieve(input_query)`

			`print('Retrieved knowledge:')`
			`for chunk, similarity in retrieved_knowledge:`
			`print(f' - (similarity: {similarity:.2f}) {chunk}')`

			`instruction_prompt = f'''You are a helpful chatbot.`
			`Use only the following pieces of context to answer the question. Don't make up any new information:`
			`{'\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])}`
			`'''`

			`stream = ollama.chat(`
			`model=LANGUAGE_MODEL,`
			`messages=[`
			`{'role': 'system', 'content': instruction_prompt},`
			`{'role': 'user', 'content': input_query},`
			`],`
			`stream=True,`
			`)`

			`# print the response from the chatbot in real-time`
			`print('Chatbot response:')`
			`for chunk in stream:`
			`print(chunk['message']['content'], end='', flush=True)`