fixes, login and more

This commit is contained in:
Tobias Weise 2024-06-21 14:56:36 +02:00
parent 357054b5af
commit f6f5138f6e
6 changed files with 509 additions and 97 deletions

View File

@ -7,28 +7,23 @@ OpenAPI access via http://localhost:5000/openapi/ on local docker-compose deploy
#warnings.filterwarnings("ignore")
#std lib modules:
import os, sys, json
import os, sys, json, time
from typing import Any, Tuple, List, Dict, Any, Callable, Optional
from datetime import datetime, date
from collections import namedtuple
import hashlib, traceback, logging
from functools import wraps
#llm
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama
#import openai #even used?
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
#from langchain.callbacks import get_openai_callback
#from langchain_community.callbacks import get_openai_callback
#from langchain_openai import ChatOpenAI, AzureChatOpenAI
#from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
#from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.callbacks.base import BaseCallbackHandler, BaseCallbackManager
@ -36,11 +31,15 @@ from langchain.prompts import PromptTemplate
#ext libs
from elasticsearch import NotFoundError, Elasticsearch # for normal read/write without vectors
from elasticsearch_dsl import Search, A
from elasticsearch_dsl import Document, Date, Integer, Keyword, Float, Long, Text, connections
from elasticsearch_dsl import Search, A, Document, Date, Integer, Keyword, Float, Long, Text, connections
from elasticsearch.exceptions import ConnectionError
from pydantic import BaseModel, Field
import logging_loki
import jwt as pyjwt
#flask, openapi
from flask import Flask, send_from_directory, Response, request, jsonify
import sys, os
@ -50,20 +49,159 @@ from flask_openapi3 import Info, Tag, OpenAPI, Server, FileStorage
from flask_socketio import SocketIO, join_room, leave_room, rooms, send
#home grown
import base64
import os
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
#----------home grown--------------
#from scraper import WebScraper
from funcs import group_by
#from elastictools import update_by_id, delete_by_id
from elastictools import get_by_id, update_by_id, delete_by_id
from models import QueryLog, Chatbot, User
#TODO: implement some kind of logging mechanism
#logging.basicConfig(filename='record.log', level=logging.DEBUG)
#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.WARN)
app = Flask(__name__)
from flask_cors import CORS #falls cross-orgin verwendet werden soll
CORS(app)
#LLM_PAYLOAD = int(os.getenv("LLM_PAYLOAD"))
#CHUNK_SIZE = int(os.getenv("CHUNK_SIZE"))
BOT_ROOT_PATH = os.getenv("BOT_ROOT_PATH")
# JWT Bearer Sample
jwt = {
"type": "http",
"scheme": "bearer",
"bearerFormat": "JWT"
}
security_schemes = {"jwt": jwt}
security = [{"jwt": []}]
info = Info(
title="Chatbot-API",
version="1.0.0",
summary="The REST-API",
description="Default model: ..."
)
servers = [
Server(url=BOT_ROOT_PATH )
]
class NotFoundResponse(BaseModel):
code: int = Field(-1, description="Status Code")
message: str = Field("Resource not found!", description="Exception Information")
app = OpenAPI(
__name__,
info=info,
servers=servers,
responses={404: NotFoundResponse},
security_schemes=security_schemes
)
def jwt_required(f):
"""
Wraps routes in a jwt-required logic and passes decoded jwt and user from elasticsearch to the route as keyword
"""
@wraps(f)
def decorated_route(*args, **kwargs):
token = None
if "Authorization" in request.headers:
token = request.headers["Authorization"].split(" ")[1]
if not token:
return jsonify({
'status': 'error',
"message": "Authentication Token is missing!",
}), 401
try:
data = pyjwt.decode(token, app.config["jwt_secret"], algorithms=["HS256"])
except Exception as e:
return jsonify({
'status': 'error',
"message": "JWT-decryption: " + str(e)
}), 401
try:
#user = get_by_id(client, index="user", id_field_name="email", id_value=data["email"])[0]
response = Search(using=client, index="user").filter("term", **{"email": data["email"]})[0:5].execute()
for hit in response:
user = hit
break
except Exception as e:
return jsonify({
'status': 'error',
"message": "Invalid Authentication token!"
}), 401
kwargs["decoded_jwt"] = data
kwargs["user"] = user
return f(*args, **kwargs)
return decorated_route
def create_key(salt: str, user_email: str) -> Fernet:
"""
Example salt: 9c46f833b3376c5f3b64d8a93951df4b
Fernet usage: token = f.encrypt(b"Secret message!")
"""
salt_bstr = bytes(salt, "utf-8")
email_bstr = bytes(user_email, "utf-8")
#password = b"password"
#salt = os.urandom(16)
#salt = b"9c46f833b3376c5f3b64d8a93951df4b"
kdf = PBKDF2HMAC(
algorithm=hashes.SHA256(),
length=32,
salt=salt_bstr,
iterations=48,
)
key = base64.urlsafe_b64encode(kdf.derive(email_bstr))
return Fernet(key)
#app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['CORS_HEADERS'] = 'Content-Type'
app.config['CORS_METHODS'] = ["GET,POST,OPTIONS,DELETE,PUT"]
env_to_conf = {
"BACKEND_INTERNAL_URL": "api_url",
"ELASTIC_URI": "elastic_uri",
"SECRET": "jwt_secret"
}
#import values from env into flask config and do existence check
for env_key, conf_key in env_to_conf.items():
x = os.getenv(env_key)
if not x:
msg = "Environment variable '%s' not set!" % env_key
app.logger.fatal(msg)
sys.exit(1)
else:
app.config[conf_key] = x
#from flask_cors import CORS #falls cross-orgin verwendet werden soll
#CORS(app)
socket = SocketIO(app, cors_allowed_origins="*")
@ -74,7 +212,7 @@ def sockcon(data):
to avoid broadcasting messages
answer in callback only to room with sid
"""
room = request.sid
room = request.sid + request.remote_addr
join_room(room)
socket.emit('backend response', {'msg': f'Connected to room {room} !', "room": room}) # looks like iOS needs an answer
@ -112,16 +250,129 @@ def handle_message(message):
def create_embedding():
pass
def hash_password(s: str) -> str:
return hashlib.md5(s.encode('utf-8')).hexdigest()
#======================= TAGS =============================
jwt_tag = Tag(name='JWT', description='Requires a valid JSON Web Token')
not_implemented_tag = Tag(name='Not implemented', description='Functionality not yet implemented beyond an empty response')
#==============Routes===============
class LoginRequest(BaseModel):
email: str = Field(None, description='A short text by the user explaining the rating.')
password: str = Field(None, description='A short text by the user explaining the rating.')
@app.post('/login', summary="", tags=[], security=security)
def login(form: LoginRequest):
"""
Get your JWT to verify access rights
"""
client = Elasticsearch(app.config['elastic_uri'])
match get_by_id(client, index="user", id_field_name="email", id_value=form.email):
case []:
msg = "User with email '%s' doesn't exist!" % form.email
app.logger.error(msg)
return jsonify({
'status': 'error',
'message': msg
}), 400
case [user]:
if user["password_hash"] == hash_password(form.password):
return pyjwt.encode({"email": form.email}, app.config['jwt_secret'], algorithm="HS256")
else:
msg = "Invalid password!"
app.logger.error(msg)
return jsonify({
'status': 'error',
'message': msg
}), 400
class IndexSchemaRequest(BaseModel):
#end: datetime = Field("2100-01-31T16:47+00:00", description="""The interval end datetime in <a href="https://en.wikipedia.org/wiki/ISO_8601">ISO 8601</a> format""")
pass
@app.get('/bot', summary="", tags=[jwt_tag], security=security)
@jwt_required
def get_all_bots(decoded_jwt, user):
"""
List all bots for a user identified by the JWT.
"""
#client = Elasticsearch(app.config['elastic_uri'])
#bots = get_by_id(client, index="chatbot", id_field_name="createdBy", id_value=nextsearch_user.meta.id)
#return jsonify(bots)
return jsonify([])
@app.post('/bot', summary="", tags=[jwt_tag, not_implemented_tag], security=security)
@jwt_required
def create_bot(query: IndexSchemaRequest):
"""
Creates a chatbot for the JWT associated user.
"""
return ""
#======== DEBUG routes ============
@app.get('/bot/debug/schema', summary="", tags=[])
def get_schema(query: IndexSchemaRequest):
"""
"""
#chatbots = query.chatbots
#client = Elasticsearch(app.config['elastic_uri'])
def simplify_properties(d):
new_d = {}
for field, d3 in d["properties"].items():
if "type" in d3:
new_d[field] = d3["type"]
elif "properties" in d3:
new_d[field] = simplify_properties(d3)
return new_d
def get_type_schema(client: Elasticsearch):
d = client.indices.get(index="*").body
new_d = {}
for index, d2 in d.items():
new_d[index] = simplify_properties(d2["mappings"])
return new_d
return jsonify( get_type_schema(client) )
#TODO: route that takes a schema json and compares to internal structure and returns boolean
#-------- non api routes -------------
@app.route("/") #Index Verzeichnis
def index():
return send_from_directory('.', "index.html")
@app.route("/info") #spezielle Nutzer definierte Route
def info():
return sys.version+" "+os.getcwd()
#@app.route("/info") #spezielle Nutzer definierte Route
#def info():
# return sys.version+" "+os.getcwd()
@app.route('/<path:path>') #generische Route (auch Unterordner)
def catchAll(path):
@ -129,13 +380,78 @@ def catchAll(path):
def init_indicies():
# create the mappings in elasticsearch
for Index in [QueryLog, Chatbot, User]:
Index.init()
def create_default_users():
#create default users
client = Elasticsearch(app.config['elastic_uri'])
default_users = os.getenv("DEFAULT_USERS")
if default_users:
for (email, pwd, role) in json.loads(default_users):
if len(get_by_id(client, index="user", id_field_name="email", id_value=email)) == 0:
user = User(email=email, password_hash=hash_password(pwd), role=role)
#user.published_from = datetime.now()
user.save()
if __name__ == '__main__':
#Wenn HTTPS benötigt wird (Pfade für RHEL7/können je OS variieren)
#cert = "/etc/pki/tls/certs/cert-payment.pem" #cert
#key = "/etc/pki/tls/private/cert-payment-private.pem" #key
#context = (cert, key)
#app.run(debug=True, host='0.0.0.0', ssl_context=context)
app.run(debug=True, host='0.0.0.0')
#app.run(debug=True)
#TODO: implement some kind of logging mechanism
#logging.basicConfig(filename='record.log', level=logging.DEBUG)
#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.WARN)
"""
USE_LOKI_LOGGER = os.getenv("USE_LOKI_LOGGER")
if USE_LOKI_LOGGER:
handler = logging_loki.LokiHandler(
url="http://loki:3100/loki/api/v1/push",
tags={"application": "Nextsearch"},
#auth=("username", "password"),
version="1",
)
app.logger.addHandler(handler)
"""
connections.create_connection(hosts=app.config['elastic_uri'])
#client = Elasticsearch(app.config['elastic_uri'])
#client = Elasticsearch(hosts=[{"host": "elasticsearch"}], retry_on_timeout=True)
client = Elasticsearch(app.config['elastic_uri'], retry_on_timeout=True)
#TODO: find a clean way to wait without exceptions!
#Wait for elasticsearch to start up!
i = 1
while True:
try:
#client = Elasticsearch(app.config['elastic_uri'])
client.cluster.health(wait_for_status='yellow')
print("Elasticsearch found! Run Flask-app!", flush=True)
break
except ConnectionError:
i *= 1.5
time.sleep(i)
print("Elasticsearch not found! Wait %s seconds!" % i, flush=True)
# Display cluster health
#app.logger.debug(connections.get_connection().cluster.health())
init_indicies()
create_default_users()
app.run(debug=True, host='0.0.0.0')

View File

@ -0,0 +1,32 @@
ELASTIC_URI=http://elasticsearch:9200
DEFAULT_USERS=[["user@gmail.com", "1234", "user"], ["admin@gmail.com", "1234", "admin"]]
# DEFAULT_USERS is list of lists, each nested list respectively contains email, password and role
# e.g. [["user@gmail.com", "1234", "user"], ["admin@gmail.com", "1234", "admin"]]
# leave empty if you don't wish to seed users
#-----------not used yet----------------
#JWT encryption secret:
SECRET=1234
LLM_PAYLOAD=16384
CHUNK_SIZE=1536
#A flag for using Loki for logging. To deactivate comment it out
USE_LOKI_LOGGER=1
BOT_ROOT_PATH=/
BACKEND_INTERNAL_URL=http://backend:5000

View File

@ -4,8 +4,12 @@ Some helper functions to make querying easier
from typing import Any, Tuple, List, Dict, Any, Callable, Optional
import json
from elasticsearch import NotFoundError, Elasticsearch # for normal read/write without vectors
from elasticsearch_dsl import Search, A
from elasticsearch_dsl import Document, Date, Integer, Keyword, Float, Long, Text, connections
from elasticsearch_dsl import Search, A, UpdateByQuery, Document, Date, Integer, Keyword, Float, Long, Text, connections
def get_by_id(client: Elasticsearch, index: str, id_field_name: str, id_value: str):
response = Search(using=client, index=index).filter("term", **{id_field_name: id_value})[0:10000].execute()
return [hit.to_dict() for hit in response]
def update_by_id(client: Elasticsearch, index: str, id_field_name: str, id_value: str, values_to_set: Dict[str, Any]) -> None:
@ -14,6 +18,7 @@ def update_by_id(client: Elasticsearch, index: str, id_field_name: str, id_value
for k, v in values_to_set.items():
source += f"ctx._source.{k} = {json.dumps(v)};"
"""
body = {
"query": {
"term": {
@ -26,6 +31,14 @@ def update_by_id(client: Elasticsearch, index: str, id_field_name: str, id_value
}
}
client.update_by_query(index=index, body=body)
"""
ubq = UpdateByQuery(using=client, index=index) \
.query("term", **{id_field_name: id_value}) \
.script(source=source, lang="painless")
response = ubq.execute()
return response.success()

View File

@ -1,84 +1,127 @@
import os
from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Keyword, Float, Long, Text, connections, Object
from elasticsearch_dsl import Document, InnerDoc, Nested, Date, Integer, Keyword, Float, Long, Text, connections, Object, Boolean
# Define a default Elasticsearch client
connections.create_connection(hosts="http://localhost:9200")
class Article(Document):
title = Text(analyzer='snowball', fields={'raw': Keyword()})
body = Text(analyzer='snowball')
tags = Keyword()
published_from = Date()
lines = Integer()
class User(Document):
email = Keyword()
password_hash = Text(index=False)
role = Keyword()
#salt = Text(index=False)
#profileImage = Text(index=False)
#profileImage = Keyword()
isEmailVerified = Boolean()
#status = Text()
#otpExpires = Date()
#resetPasswordToken = Text(index=False)
#mailToken = Text(index=False)
class Index:
name = 'blog'
name = 'user'
settings = {
"number_of_shards": 1,
}
def save(self, ** kwargs):
self.lines = len(self.body.split())
return super(Article, self).save(** kwargs)
return super(User, self).save(**kwargs)
#======= nextsearch_log ===========
class Chatbot(Document):
name = Text()
createdBy = Keyword()
description = Text()
systemPrompt = Text(index=False)
#slug = Keyword()
files = Nested()
text = Text()
links = Nested()
#chatbotImage = Text(index=False)
sourceCharacters = Integer()
#visibility = Keyword()
#status = Keyword()
temperature = Float()
llm_model = Keyword()
class Index:
name = 'chatbot'
settings = {
"number_of_shards": 1,
}
def save(self, ** kwargs):
return super(Chatbot, self).save(**kwargs)
#======= Query Log ===========
class Sources(InnerDoc):
score = Float()
sourceFileId = Text()
#sourceFileId = Text()
sourceType = Text()
tags = Text()
class NextsearchLog(Document):
a = Text()
#new fields
sourceFileId = Keyword()
filename = Keyword()
url = Keyword()
txt_id = Keyword()
page = Integer()
class QueryLog(Document):
answer = Text()
question = Text()
chatbotid = Keyword()
durasecs = Float()
inCt = Float()
#inCt = Float()
inToks = Long()
llm = Text()
outCt = Float()
#outCt = Float()
outToks = Long()
q = Text()
queryid = Keyword()
rating = Long()
reason = Text()
reasontags = Text()
#queryid = Keyword()
#rating = Long()
#reason = Text()
#reasontags = Text()
session = Keyword()
sources = Object(Sources) #Text(analyzer='snowball')
sources = Object(Sources)
temperature = Float()
totalCt = Float()
#totalCt = Float()
timest = Date() #timestamp
date = Date() #iso date
class Index:
#name = 'test_nextsearch_log'
name = 'nextsearch_log'
name = 'query_log'
settings = {
"number_of_shards": 1,
}
def save(self, ** kwargs):
self.lines = len(self.body.split())
return super(NextsearchLog, self).save(** kwargs)
return super(QueryLog, self).save(**kwargs)
if __name__ == "__main__":
elastic_uri = os.getenv("ELASTIC_URI")
#elastic_uri = "http://localhost:9200"
assert elastic_uri
# Define a default Elasticsearch client
connections.create_connection(hosts=elastic_uri)
#connections.create_connection(hosts)
# create the mappings in elasticsearch
NextsearchLog.init()
# create the mappings in elasticsearch
#Article.init()
# create and save and article
#article = Article(meta={'id': 42}, title='Hello world!', tags=['test'])

View File

@ -13,3 +13,9 @@ Flask-SocketIO
flask-openapi3
minio
python-logging-loki
pyjwt
cryptography

View File

@ -36,12 +36,14 @@ services:
container_name: ${APP_PREFIX}_elasticsearch
image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
restart: always
mem_limit: 4024m
ports:
- "9200:9200"
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- logger.level=ERROR
#- ES_JAVA_OPTS="-Xms2g -Xmx2g"
volumes:
- esdata:/usr/share/elasticsearch/data
ulimits: