ChatGPT over your data
Notebook inspired by:
- Tutorial: ChatGPT Over Your Data
- Build a GitHub Support Bot with GPT3, LangChain, and Python
- Meet Bricky - a conversational bot using OpenAI
Installing the necessary packages.
%%capture
# update or install the necessary libraries
!pip install --upgrade openai
!pip install --upgrade langchain
!pip install --upgrade python-dotenv
!pip install --upgrade pypdf
!pip install --upgrade faiss-cpu
!pip install --upgrade tiktoken
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
import requests
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
import pathlib
import subprocess
import tempfile
import pickle
import openai
import os
import IPython
from dotenv import load_dotenv
load_dotenv()
# API configuration
openai.api_key = os.getenv("OPENAI_API_KEY")
# for LangChain
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["SERPAPI_API_KEY"] = os.getenv("SERPAPI_API_KEY")
Loading markdown documents from GitHub using FAISS
def get_github_docs(repo_owner, repo_name):
    with tempfile.TemporaryDirectory() as d:
        subprocess.check_call(
            f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
            cwd=d,
            shell=True,
        )
        git_sha = (
            subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d)
            .decode("utf-8")
            .strip()
        )
        repo_path = pathlib.Path(d)
        markdown_files = list(repo_path.glob("**/*.md")) + list(
            repo_path.glob("**/*.mdx")
        )
        for markdown_file in markdown_files:
            with open(markdown_file, "r") as f:
                relative_path = markdown_file.relative_to(repo_path)
                github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
                yield Document(page_content=f.read(), metadata={"source": github_url})
def source_docs():
    # return list(get_github_docs("dagster-io", "dagster"))
    # SageMaker docs: awsdocs, amazon-sagemaker-developer-guide
    return list(get_github_docs("awsdocs", "amazon-sagemaker-developer-guide"))
def search_index(source_docs):
    source_chunks = []
    splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
    for source in source_docs:
        for chunk in splitter.split_text(source.page_content):
            source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

    with open("search_index.pickle", "wb") as f:
        pickle.dump(FAISS.from_documents(source_chunks, OpenAIEmbeddings()), f)
chain = load_qa_with_sources_chain(OpenAI(temperature=0))
def print_answer(question):
    with open("search_index.pickle", "rb") as f:
        search_index = pickle.load(f)
    print(
        chain(
            {
                "input_documents": search_index.similarity_search(question, k=4),
                "question": question,
            },
            return_only_outputs=True,
        )["output_text"]
    )
print(search_index(source_docs()))
Cloning into '.'...
Created a chunk of size 1056, which is longer than the specified 1024
Created a chunk of size 1807, which is longer than the specified 1024
None
#print_answer("who is the lead singer of matchbox 20")
#print_answer("what are the types of sagemaker endpoints?")
"Can I use SageMaker for Training and Inference with Apache Spark?") print_answer(
Yes, you can use SageMaker for Training and Inference with Apache Spark.
SOURCES:
https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/d514c7799d1c934c96e97655b71dbd9cd78cd59b/doc_source/apache-spark.md
https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/d514c7799d1c934c96e97655b71dbd9cd78cd59b/doc_source/how-it-works-prog-model.md
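To sanity-check what the chain is actually given, you can load the same pickled index and run the retrieval step by hand. A minimal sketch reusing the notebook's search_index.pickle and the k=4 used in print_answer; the question string is only an example:
# Reload the FAISS index built from the SageMaker docs above
with open("search_index.pickle", "rb") as f:
    index = pickle.load(f)

# Show which source pages back the top-4 retrieved chunks
question = "Can I use SageMaker for Training and Inference with Apache Spark?"
for doc in index.similarity_search(question, k=4):
    print(doc.metadata["source"])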
Loading from PDF files using FAISS
from langchain.document_loaders import PyPDFLoader

filename = "./example_data/2021-sustainability-report-amazon.pdf"
loader = PyPDFLoader(filename)
pages = loader.load_and_split()
print(f'PDF contains {len(pages)} pages')
PDF contains 133 pages
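Before building the index, it can help to peek at what PyPDFLoader returned. A quick sketch (the preview length is arbitrary); each page is a Document whose metadata records the source file and page number:
# Preview the text of the first page and its metadata
print(pages[0].page_content[:300])
print(pages[0].metadata)  # expected keys: 'source' and 'page'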
def search_index_pdf(source_docs):
    source_chunks = []
    splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
    for source in source_docs:
        for chunk in splitter.split_text(source.page_content):
            source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

    with open("search_index.pickle", "wb") as f:
        pickle.dump(FAISS.from_documents(source_chunks, OpenAIEmbeddings()), f)
print(search_index_pdf(pages))
None
"When is Amazon net-zero carbon?") print_answer(
Amazon is aiming to reach net-zero carbon by 2030.
SOURCES: 2021-sustainability-report-amazon.pdf
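Pickling the whole FAISS wrapper is fine for a throwaway notebook, but LangChain's FAISS vector store also exposes save_local/load_local, which avoids unpickling arbitrary objects. A hedged sketch (the exact API can differ across LangChain versions, and "faiss_pdf_index" is just an example folder name):
# Reload the pickled index, then persist it as a folder instead of a pickle
with open("search_index.pickle", "rb") as f:
    index = pickle.load(f)
index.save_local("faiss_pdf_index")

# Later sessions reload it with the same embedding model
reloaded = FAISS.load_local("faiss_pdf_index", OpenAIEmbeddings())
print(reloaded.similarity_search("net-zero carbon", k=2)[0].metadata)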