A chatbot is a software application designed to conduct online chat conversations via text or text-to-speech, simulating how a human would behave as a conversational partner. While some chatbots are simple and based on pre-programmed responses, more advanced chatbots utilize artificial intelligence (AI) technologies like natural language processing (NLP) to understand context, manage dialogues, and provide more personalized and accurate responses.
In this tutorial, I build a chatbot that answers questions about the contents of a PDF file. To develop the application, I am using the Flask framework, OpenAI, Transformers, MongoDB, and other technologies.
Step I: Set Up Your Development Environment
1. Install Required Libraries: Ensure you have flask, requests, and pymongo installed (bash):
pip install flask requests pymongo
2. Obtain API Key: Obtain an API key from OpenAI to use their API for GPT-3. Follow the instructions on the OpenAI website to sign up and get access.
Step II: Initialize Your Project
1. Project Structure: Set up your project structure:
pdf_chatbot/
├── app.py
├── chatbot.py
├── config.py
├── requirements.txt
├── static/
└── templates/
    └── index.html
2. Configure API Key: Store your API key securely. You can use environment variables or a configuration file (config.py).
import os
import pymongo
from pymongo import MongoClient
import urllib.parse
class Config:
    """Application settings, read from environment variables at import time.

    Environment variables:
        MONGO_USERNAME / MONGO_PASSWORD: MongoDB credentials (optional).
        MONGO_HOST: ``host[:port]`` of the MongoDB server.
        OPENAI_API_KEY: key assigned to ``openai.api_key`` by the application.
    """

    # Credentials are percent-encoded so characters such as '@' or ':' cannot
    # break the connection string.
    username1 = urllib.parse.quote_plus(os.environ.get('MONGO_USERNAME', ''))
    password1 = urllib.parse.quote_plus(os.environ.get('MONGO_PASSWORD', ''))
    MONGO_HOST = os.environ.get('MONGO_HOST', 'localhost:27017')
    # Leave the "user:password@" part out entirely when no username is set.
    _credentials = f'{username1}:{password1}@' if username1 else ''
    MONGO_URI = f'mongodb://{_credentials}{MONGO_HOST}/?authSource=admin'
    # Never hard-code the real key here; an empty value means "not configured".
    SECRET_KEY = os.environ.get('OPENAI_API_KEY', '')
    EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
To ensure that your project is easily reproducible and maintainable, it’s a good practice to create a requirements.txt file. This file lists all the dependencies your project needs, making it easier for others to install and run your code. (requirements.txt)
Flask==2.1.1
pymongo==4.0.1
transformers==4.27.1
# NOTE(review): confirm this openai release supports the model used in chatbot.py.
openai==0.8.3
scikit-learn==1.0.2
python-dotenv==0.20.0
# Imported by chatbot.py / app.py but missing from the original list.
# Left unpinned here; pin to the versions you have tested.
PyMuPDF
sentence-transformers
rapidfuzz
numpy
Step III: Develop the Chatbot Logic
Create Chatbot Logic (chatbot.py): Implement the logic to handle PDF content, chat history, feedback, follow up questions, related responses and comments.
import fitz # PyMuPDF
from sentence_transformers import SentenceTransformer
import pymongo
from pymongo import MongoClient
import urllib.parse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import openai
import os
from datetime import datetime, timedelta
from pymongo import DESCENDING
from rapidfuzz import fuzz
from config import Config
# Module-level conversation log (separate from the per-instance list kept by Chatbot).
chat_history = []

# Force TOKENIZERS_PARALLELISM to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hand the configured key to the OpenAI client library.
openai.api_key = Config.SECRET_KEY
class Chatbot:
    """Answer questions about uploaded PDFs using MongoDB, embeddings and OpenAI.

    PDF text is split into chunks whose embeddings are stored in MongoDB.
    ``handle_followup`` answers a query from, in order of priority: a similar
    earlier answer, the last answer (for follow-up questions), or the most
    similar stored PDF chunks.
    """

    COMPLETION_ENGINE = "gpt-3.5-turbo-instruct"
    # Suffixes appended to every contextual prompt sent to the model.
    TONE_STYLE_INSTRUCTION = "The response should be in a tone like neutral, style like informative"
    OPENING_INSTRUCTION = "Ensure the response starts with a strong and relevant opening sentence but not include in response like Opening Sentence:.\n\n"
    # Substrings that mark a query as a follow-up to the previous answer.
    FOLLOW_UP_KEYWORDS = (
        "more", "detail", "elaborate", "specifics", "explain", "expand",
        "further", "additional", "context", "depth", "examples", "information",
    )

    def __init__(self):
        """Connect to MongoDB and load the sentence-embedding model once."""
        self.chat_history = []
        self.client = MongoClient(Config.MONGO_URI)
        self.db = self.client['pdf_database']
        self.collection = self.db['pdf_chunks']
        self.feedback_collection = self.db["feedback"]
        self.comment_collection = self.db["comments"]
        self.chat_collection = self.db['chat_history']
        self.follow_up_collection = self.db['follow_up_phrases']
        self.documents = []
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _complete(self, prompt, max_tokens=500):
        """Send *prompt* to the completion engine and return the stripped text."""
        response = openai.Completion.create(
            engine=self.COMPLETION_ENGINE,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        return response.choices[0].text.strip()

    def _with_style_instructions(self, prompt):
        """Append the shared tone/style and opening-sentence instructions."""
        return f"{prompt}\n\n{self.TONE_STYLE_INSTRUCTION}\n\n{self.OPENING_INSTRUCTION}"

    def _answer_from_context(self, user_input, context):
        """Answer *user_input* using *context* (an earlier response) as grounding."""
        specific_prompt = f"User Query: {user_input}\nContext: {context}\nResponse:"
        return self._complete(self._with_style_instructions(specific_prompt))

    # ------------------------------------------------------------------
    # Ingestion
    # ------------------------------------------------------------------
    def read_pdf(self, file_path):
        """Extract the text of every page and split it on ``'. '``.

        The chunks are also appended to ``self.documents``.

        Returns:
            list[str]: the text chunks, in page order.
        """
        text_chunks = []
        # The context manager closes the document even if extraction fails.
        with fitz.open(file_path) as doc:
            for page in doc:
                text_chunks.extend(page.get_text("text").split('. '))
        self.documents.extend(text_chunks)
        return text_chunks

    def store_embeddings_in_mongo(self, text_chunks):
        """Embed each non-blank chunk and insert it into the chunk collection."""
        for chunk in text_chunks:
            if not chunk.strip():
                print(f"Skipping empty chunk: {chunk[:30]}...")
                continue
            # Reuse the model loaded in __init__ instead of reloading it per call.
            embedding = self.embedding_model.encode(chunk)
            self.collection.insert_one({
                "text": chunk,
                "embedding": embedding.tolist(),
            })

    # ------------------------------------------------------------------
    # Retrieval
    # ------------------------------------------------------------------
    def generate_query_embedding(self, query):
        """Return the embedding of *query*, encoded as a one-element batch."""
        return self.embedding_model.encode([query])

    def find_similar_documents(self, query_embedding, threshold, top_k=5):
        """Return up to *top_k* stored chunks whose cosine similarity is >= *threshold*.

        Returns an empty list when there is no query embedding, no stored
        chunk, or ``top_k`` is not positive. Results are ordered from most to
        least similar.
        """
        if query_embedding is None or top_k <= 0:
            return []
        documents = [
            {"text": doc["text"], "embedding": np.array(doc["embedding"])}
            for doc in self.collection.find({})
        ]
        # cosine_similarity cannot be evaluated against an empty matrix.
        if not documents:
            return []
        embeddings = np.array([doc["embedding"] for doc in documents])
        cos_sim = cosine_similarity(query_embedding, embeddings)[0]
        top_k_indices = np.argsort(cos_sim)[-top_k:][::-1]
        return [documents[i] for i in top_k_indices if cos_sim[i] >= threshold]

    def fetch_related_response(self, query_embedding, threshold=0.7):
        """Return the stored chat response most similar to the query, or None.

        None is returned when there is no embedding, no chat history, or the
        best similarity is below *threshold*.
        """
        if query_embedding is None:
            return None
        chat_history = list(self.chat_collection.find().sort("timestamp", pymongo.ASCENDING))
        chat_responses = [entry['response'] for entry in chat_history]
        if not chat_responses:
            return None
        chat_embeddings = self.embedding_model.encode(chat_responses)
        similarities = cosine_similarity(query_embedding, chat_embeddings)
        if np.max(similarities) >= threshold:
            return chat_history[np.argmax(similarities)]['response']
        return None

    # ------------------------------------------------------------------
    # Feedback-driven prompting
    # ------------------------------------------------------------------
    def calculate_new_threshold(self, feedback_data):
        """Derive the similarity threshold from the average feedback rating.

        Returns 0.5 without feedback; otherwise 0.5 shifted by 0.1 per rating
        point above or below 3.
        """
        if not feedback_data:
            return 0.5  # Default threshold
        average_rating = sum(fb['rating'] for fb in feedback_data) / len(feedback_data)
        return 0.5 + (average_rating - 3) * 0.1

    def adjust_prompt_based_on_query_feedback(self, query, context, feedback_data):
        """Build the query-specific prompt, tuned by feedback given for this query."""
        average_rating, _, _ = self.analyze_feedback_for_query(query, feedback_data)
        base = f"User Query: {query}\nContext: {context}\n"
        if average_rating is not None and average_rating >= 4:
            return base + "Given positive feedback for this query, provide a creative and detailed response."
        if average_rating is not None and average_rating <= 2:
            return base + "Given negative feedback for this query, provide a more context-rich and clear response."
        # No feedback, or a middling average: plain prompt.
        return base + "Response:"

    def adjust_prompt_based_on_overall_feedback(self, context, feedback_data):
        """Build the general prompt, tuned by the average of all feedback."""
        average_rating, _, _ = self.analyze_feedback(feedback_data)
        base = f"Context: {context}\n"
        if average_rating is not None and average_rating >= 4:
            return base + "Given overall positive feedback, provide a creative and detailed response."
        if average_rating is not None and average_rating <= 2:
            return base + "Given overall negative feedback, provide a more context-rich and clear response."
        return base + "Response:"

    def combine_prompts(self, specific_prompt, overall_prompt):
        """Join the two prompts, each followed by a blank line."""
        return f"{specific_prompt}\n\n{overall_prompt}\n\n"

    def generate_response(self, user_query, top_documents, feedback_data):
        """Answer *user_query* from the text of *top_documents*.

        The prompt combines the query-specific and overall feedback-adjusted
        prompts with the shared style instructions.
        """
        # The context comes only from the documents the caller already selected;
        # no second similarity search is needed.
        context = " ".join(doc["text"] for doc in top_documents)
        specific_prompt = self.adjust_prompt_based_on_query_feedback(user_query, context, feedback_data)
        overall_prompt = self.adjust_prompt_based_on_overall_feedback(context, feedback_data)
        combined_prompt = self.combine_prompts(specific_prompt, overall_prompt)
        return self._complete(self._with_style_instructions(combined_prompt))

    def generate_digital_marketing_response(self, query):
        """Answer *query* with a short digital-marketing oriented completion."""
        prompt = f"User Query: {query}\nGenerate a detailed and informative response related to digital marketing."
        return self._complete(prompt, max_tokens=150)

    # ------------------------------------------------------------------
    # Feedback and comments
    # ------------------------------------------------------------------
    def collect_user_feedback(self, query, response, rating):
        """Store one rating for a query/response pair and return a status string."""
        feedback_data = {
            "query": query,
            "response": response,
            "rating": int(rating),
            "timestamp": datetime.now(),
        }
        self.feedback_collection.insert_one(feedback_data)
        return 'Feedback is added'

    def collect_user_comment(self, comment):
        """Store a free-text comment with the current timestamp."""
        comment_data = {
            "comment": comment,
            "timestamp": datetime.now(),
        }
        self.comment_collection.insert_one(comment_data)
        print(f"Collected Comment: {comment_data}")

    def analyze_feedback(self, feedback_data=None):
        """Summarize feedback as ``(average_rating, positive_count, negative_count)``.

        Uses *feedback_data* when it is non-empty, otherwise every stored
        feedback entry. ``average_rating`` is None when there is no feedback.
        Ratings >= 4 count as positive and ratings <= 2 as negative.
        """
        feedbacks = feedback_data or list(self.feedback_collection.find())
        positive_count = sum(1 for fb in feedbacks if fb['rating'] >= 4)
        negative_count = sum(1 for fb in feedbacks if fb['rating'] <= 2)
        average_rating = sum(fb['rating'] for fb in feedbacks) / len(feedbacks) if feedbacks else None
        print(f"Positive Feedback Count: {positive_count}")
        print(f"Negative Feedback Count: {negative_count}")
        return average_rating, positive_count, negative_count

    def analyze_feedback_for_query(self, query, feedback_data=None):
        """Summarize the last 30 days of feedback given for *query*.

        When *feedback_data* is non-empty it is filtered down to entries for
        this query (compared case-insensitively, because ``handle_followup``
        re-capitalizes the query); otherwise the feedback collection is
        queried. Returns ``(None, 0, 0)`` when nothing matches.
        """
        cutoff = datetime.now() - timedelta(days=30)
        if feedback_data:
            wanted = query.strip().lower()
            feedbacks = [
                fb for fb in feedback_data
                if (fb.get("query") or "").strip().lower() == wanted
                # Entries without a datetime timestamp are kept rather than guessed at.
                and (not isinstance(fb.get("timestamp"), datetime) or fb["timestamp"] >= cutoff)
            ]
        else:
            feedbacks = list(self.feedback_collection.find(
                {"query": query, "timestamp": {"$gte": cutoff}}
            ))
        if not feedbacks:
            return None, 0, 0
        positive_count = sum(1 for fb in feedbacks if fb['rating'] >= 4)
        negative_count = sum(1 for fb in feedbacks if fb['rating'] <= 2)
        average_rating = sum(fb['rating'] for fb in feedbacks) / len(feedbacks)
        return average_rating, positive_count, negative_count

    # ------------------------------------------------------------------
    # Follow-up handling
    # ------------------------------------------------------------------
    def get_last_response(self):
        """Return the most recent chat-history document, or None if there is none."""
        last_chat = list(self.chat_collection.find().sort("timestamp", pymongo.DESCENDING).limit(1))
        return last_chat[0] if last_chat else None

    def get_follow_up_phrases(self):
        """Return the ``phrase`` field of every stored follow-up phrase."""
        return [phrase['phrase'] for phrase in self.follow_up_collection.find()]

    def is_follow_up_query(self, user_input):
        """Return True when *user_input* looks like a follow-up question.

        A query qualifies if it fuzzy-matches a stored follow-up phrase
        (partial ratio above 80) or contains any follow-up keyword.
        """
        lowered = user_input.lower()
        for phrase in self.get_follow_up_phrases():
            if fuzz.partial_ratio(lowered, phrase.lower()) > 80:  # Adjust the threshold as needed
                return True
        return any(keyword in lowered for keyword in self.FOLLOW_UP_KEYWORDS)

    def handle_followup(self, user_input):
        """Produce a response for *user_input* and record it in ``self.chat_history``.

        Priority: a similar earlier response, then the last response (for
        follow-up questions), then the most similar PDF chunks. The PDF search
        and the feedback lookup run only when that last branch is reached.
        """
        query_embedding = self.generate_query_embedding(user_input)
        # First character upper-cased, the rest lower-cased (original behaviour).
        # NOTE(review): this also lower-cases acronyms and names in the prompt.
        user_input = user_input.capitalize()

        related_response = self.fetch_related_response(query_embedding)
        if related_response:
            response = self._answer_from_context(user_input, related_response)
        elif self.is_follow_up_query(user_input):
            last_response = self.get_last_response()
            context = last_response["response"] if last_response else ""
            response = self._answer_from_context(user_input, context)
        else:
            feedback_data = list(self.feedback_collection.find())
            new_threshold = self.calculate_new_threshold(feedback_data)
            top_documents = self.find_similar_documents(
                query_embedding, threshold=new_threshold, top_k=5
            )
            if top_documents:
                response = self.generate_response(user_input, top_documents, feedback_data)
            else:
                response = 'There is no matching documents.'

        self.chat_history.append({"query": user_input, "response": response, "context": response})
        return response
Step IV: Set Up the Web Application
Create Flask Application (app.py): Set up a Flask web application to handle file uploads, chat, feedback, related queries, follow up queries and comments.
from flask import Flask, render_template, request, redirect, url_for, jsonify
from werkzeug.utils import secure_filename
from chatbot import Chatbot # Assuming chatbot functionality is in chatbot.py
import os
from datetime import datetime, timedelta
import openai
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pymongo
from pymongo import MongoClient
import urllib.parse
from rapidfuzz import fuzz
from config import Config
# Hand the configured key to the OpenAI client library.
openai.api_key = Config.SECRET_KEY

app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER='./uploads',     # Folder where uploads will be stored
    ALLOWED_EXTENSIONS={'pdf'},    # Allowed file extensions for uploads
)

# For simplicity, use a list to store chat history
chat_history = []

# Module-wide SentenceTransformer model (used by is_related_to_chat_history)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# MongoDB handles used directly by the route handlers
client = MongoClient(Config.MONGO_URI)
db = client['pdf_database']
chat_collection = db['chat_history']
chat_feedback = db['feedback']

# Single Chatbot shared by all requests
chatbot_instance = Chatbot()
def is_related_to_chat_history(query_embedding, chat_history, threshold=0.7):
    """Check whether a query resembles any earlier response.

    Returns ``(True, response)`` for the most similar entry of *chat_history*
    when its cosine similarity reaches *threshold*, else ``(False, None)``.
    An empty history always yields ``(False, None)``.
    """
    if not chat_history:
        return False, None

    past_responses = [entry['response'] for entry in chat_history]
    scores = cosine_similarity(query_embedding, embedding_model.encode(past_responses))
    best_score = np.max(scores)
    best_index = np.argmax(scores)

    if best_score >= threshold:
        return True, chat_history[best_index]['response']
    return False, None
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    # Only the text after the last dot counts, compared case-insensitively.
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in app.config['ALLOWED_EXTENSIONS']
@app.route('/')
def index():
    """Render the main page with the stored conversation, oldest entry first."""
    history_cursor = chat_collection.find().sort("timestamp", pymongo.ASCENDING)
    return render_template('index.html', chat_history=list(history_cursor))
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Accept a PDF upload, store it, and index its text for retrieval.

    POST with a valid PDF: saves the file under UPLOAD_FOLDER, stores its
    chunk embeddings through the chatbot, then redirects to the main page.
    POST without a file or filename: redirects back to this URL.
    GET, or a file with a disallowed extension: renders ``upload.html``.
    """
    # NOTE(review): the project layout in this tutorial lists only index.html;
    # confirm templates/upload.html exists or this render will fail.
    if request.method != 'POST':
        return render_template('upload.html')

    file = request.files.get('file')
    # Browsers submit an empty part without a filename when nothing is selected.
    if file is None or file.filename == '':
        return redirect(request.url)
    if not allowed_file(file.filename):
        return render_template('upload.html')

    filename = secure_filename(file.filename)
    if not filename:
        # Sanitizing can strip the name down to nothing; treat as "no file".
        return redirect(request.url)

    # The upload folder is not guaranteed to exist on a fresh checkout.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(file_path)

    text_chunks = chatbot_instance.read_pdf(file_path)
    chatbot_instance.store_embeddings_in_mongo(text_chunks)
    return redirect(url_for('index'))  # Redirect to main page after upload
@app.route('/query', methods=['POST'])
def process_query():
    """Answer the submitted query, persist the exchange, and show the main page.

    Blank or missing queries are ignored and simply redirect to the main
    page, so no empty prompt is ever sent to the model.
    """
    query = request.form.get('query', '').strip()
    if not query:
        return redirect(url_for('index'))

    # handle_followup computes the query embedding itself.
    response = chatbot_instance.handle_followup(query)

    chat_entry = {"query": query, "response": response, "timestamp": datetime.now()}
    # Keep the in-process log and the MongoDB history in step.
    chat_history.append(chat_entry)
    chat_collection.insert_one(chat_entry)

    # The index view re-reads the history from MongoDB.
    return redirect(url_for('index'))
@app.route('/feedback', methods=['POST'])
def submit_feedback():
    """Record a thumbs-up / thumbs-down rating sent as JSON.

    Expects ``{"query": ..., "response": ..., "feedback": "thumbs_up" |
    "thumbs_down"}``. Any other feedback value is rejected with HTTP 400
    instead of being stored as rating 0, which would count as negative.
    """
    data = request.get_json(silent=True) or {}
    rating = {'thumbs_up': 5, 'thumbs_down': 1}.get(data.get('feedback'))
    if rating is None:
        return jsonify({"status": "error", "message": "Unknown feedback value."}), 400

    chatbot_instance.collect_user_feedback(data.get('query'), data.get('response'), rating)
    return jsonify({"status": "success", "message": "Feedback submitted successfully."})


@app.route('/submit_comment', methods=['POST'])
def submit_comment():
    """Record a free-text comment sent as JSON by the index page.

    Expects ``{"query": ..., "comment": ...}``. Only the comment text is
    stored, because ``Chatbot.collect_user_comment`` accepts nothing else.
    Blank comments are rejected with HTTP 400.
    """
    data = request.get_json(silent=True) or {}
    comment = (data.get('comment') or '').strip()
    if not comment:
        return jsonify({"status": "error", "message": "Comment must not be empty."}), 400

    chatbot_instance.collect_user_comment(comment)
    return jsonify({"status": "success", "message": "Comment submitted successfully."})
@app.route('/query', methods=['GET'])
def show_query():
    """Render the main page for a GET on /query, including the stored history."""
    # The template loops over chat_history; without it the page shows no history.
    chat_history = list(chat_collection.find().sort("timestamp", pymongo.ASCENDING))
    return render_template('index.html', chat_history=chat_history)
if __name__ == "__main__":
    # Start Flask's built-in server on port 8086 with debug mode enabled.
    app.run(port=8086, debug=True)
Create HTML Template (templates/index.html): Design an HTML page for file uploads, chatting, feedback, and comments.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Chatbot Query</title>
<style>
    /* Page layout: sidebar and main content sit side by side. */
    body {
        font-family: Arial, sans-serif;
        display: flex;
        margin: 0;
        padding: 0;
    }

    /* Left column holding the PDF upload form. */
    .sidebar {
        width: 25%;
        padding: 20px;
        background-color: #f8f9fa;
        box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
    }

    /* Right column holding the chat history and the query form. */
    .main-content {
        width: 75%;
        padding: 20px;
    }

    /* Card listing earlier queries and responses. */
    .chat-history {
        max-width: 600px;
        margin: 0 auto 20px auto;
        padding: 20px;
        background-color: #f0f0f0;
        border-radius: 5px;
        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }

    /* Card containing the query input. */
    .query-form {
        max-width: 600px;
        margin: 0 auto;
        padding: 20px;
        background-color: #f0f0f0;
        border-radius: 5px;
        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }

    .query-input {
        width: 100%;
        padding: 10px;
        margin-bottom: 10px;
        border: 1px solid #ccc;
        border-radius: 5px;
        box-sizing: border-box;
    }

    /* Shared look for the upload, query and comment buttons. */
    .submit-button {
        padding: 10px 20px;
        background-color: #007bff;
        color: white;
        border: none;
        border-radius: 5px;
        cursor: pointer;
    }

    /* Thumbs-up / thumbs-down row. */
    .feedback-buttons {
        display: flex;
        gap: 10px;
    }

    .comment-box {
        width: 100%;
        height: 100px;
        margin-top: 10px;
        padding: 10px;
        border: 1px solid #ccc;
        border-radius: 5px;
        resize: vertical;
    }
</style>
</head>
<body>
    <!-- Sidebar: PDF upload -->
    <div class="sidebar">
        <h2>Upload PDF</h2>
        <form action="/upload" method="post" enctype="multipart/form-data">
            <input type="file" name="file" accept=".pdf" required>
            <button type="submit" class="submit-button">Upload</button>
        </form>
    </div>
    <div class="main-content">
        <!-- One entry per stored query/response pair -->
        <div class="chat-history">
            <h2>Chat History</h2>
            {% for entry in chat_history %}
            <div>
                <strong>Query:</strong> {{ entry.query }}<br>
                <strong>Response:</strong> {{ entry.response }}<br>
                {# Values are passed to JavaScript through |tojson so that quotes, newlines
                   or markup in a query/response cannot break or inject into the handler.
                   The attributes are single-quoted because tojson output contains double quotes. #}
                <div class="feedback-buttons">
                    <button onclick='submitFeedback({{ entry.query|tojson }}, {{ entry.response|tojson }}, "thumbs_up")'>👍 </button>
                    <button onclick='submitFeedback({{ entry.query|tojson }}, {{ entry.response|tojson }}, "thumbs_down")'>👎 </button>
                </div>
                <textarea id="comment_{{ loop.index }}" class="comment-box" placeholder="Enter your comment here"></textarea>
                <button onclick='submitComment({{ entry.query|tojson }}, "comment_{{ loop.index }}")' class="submit-button">Submit Comment</button>
            </div>
            <hr>
            {% endfor %}
        </div>
        <!-- New query -->
        <div class="query-form">
            <h1>Chatbot Query</h1>
            <form action="/query" method="POST">
                <input type="text" name="query" class="query-input" placeholder="Enter your query" required>
                <button type="submit" class="submit-button">Submit Query</button>
            </form>
        </div>
    </div>
<script>
// Send a thumbs-up / thumbs-down rating for one query/response pair to POST /feedback.
// Alerts on success, and also when the request fails or the server answers with an error status.
function submitFeedback(query, response, feedback) {
    fetch('/feedback', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ query: query, response: response, feedback: feedback })
    })
        .then(res => {
            // fetch() resolves on HTTP errors too, so the status must be checked explicitly.
            if (!res.ok) {
                throw new Error(`HTTP ${res.status}`);
            }
            return res.json();
        })
        .then(() => {
            alert(`Feedback submitted: ${feedback}`);
        })
        .catch(err => {
            alert(`Feedback could not be submitted: ${err.message}`);
        });
}
// Send the text of the textarea with id `commentId` to POST /submit_comment.
// Blank comments are not sent; request or server failures are reported to the user.
function submitComment(query, commentId) {
    const comment = document.getElementById(commentId).value;
    if (!comment.trim()) {
        alert('Please enter a comment first.');
        return;
    }
    fetch('/submit_comment', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ query: query, comment: comment })
    })
        .then(res => {
            // fetch() resolves on HTTP errors too, so the status must be checked explicitly.
            if (!res.ok) {
                throw new Error(`HTTP ${res.status}`);
            }
            return res.json();
        })
        .then(() => {
            alert(`Comment submitted: ${comment}`);
        })
        .catch(err => {
            alert(`Comment could not be submitted: ${err.message}`);
        });
}
</script>
</body>
</html>