26 lines
551 B
Python
26 lines
551 B
Python
|
|
import os
|
||
|
|
import cohere
|
||
|
|
import torch
|
||
|
|
from transformers import AutoModel, AutoTokenizer
|
||
|
|
|
||
|
|
from datasets import load_dataset
|
||
|
|
|
||
|
|
# Initialize the Cohere client.
# NOTE(security): this API key is hard-coded in source control. Rotate it and
# supply COHERE_API_KEY from the environment or a secret store instead.
# setdefault keeps a key the caller has already exported rather than
# overwriting it with the embedded fallback.
os.environ.setdefault("COHERE_API_KEY", "Ef4JbSQqzi4")
# Pass the key explicitly: the SDK's documented environment variable is
# CO_API_KEY, so relying on an implicit lookup of COHERE_API_KEY is fragile
# across SDK versions.
co = cohere.Client(api_key=os.environ["COHERE_API_KEY"])
|
||
|
|
|
||
|
|
# Initialize the E5 tokenizer and model, on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# The E5 checkpoints are published under the "intfloat" organization on the
# Hugging Face Hub; the previous "infloat/..." id does not resolve.
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
# Switch to evaluation mode (inference only, no training-time behavior).
model.eval()
|
||
|
|
|
||
|
|
|
||
|
|
# Load the ready-made "jamescalam/ai-arxiv-chunked" dataset from Hugging Face,
# keeping only its train split.
data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
|
||
|
|
|