Examination

Building an exam question-answering bot with dify.ai.

It turns out that 问卷星 (wjx) does not enforce HTTPS, so the exam domain can be pointed at the local machine via the hosts file:

127.0.0.1	kaoshi.wjx.top

Next, implement a proxy service that injects our JS code into the page: while answering, the script calls the dify API to fetch the answer and appends it below the question. Since the site is now accessed over plain HTTP, no MITM/TLS interception is needed. The proxy also has to rewrite https:// to http:// in the content it returns, otherwise the page's subsequent requests would fail.

from http.server import HTTPServer, BaseHTTPRequestHandler
import urllib.request as urllib2
import gzip
import logging
import sys

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

fh = logging.FileHandler(sys.path[0] + '/考试.log', mode='a', encoding='utf-8')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)


class ProxyHandler(BaseHTTPRequestHandler):
    target_host = 'kaoshi.wjx.top'

    def proxy(self):
        header = dict(self.headers)
        logger.info('header: %s', header)
        if self.path == '/npm/eruda':
            # Serve the eruda mobile debugging console from local disk.
            self.send_response(200)
            self.end_headers()
            with open(sys.path[0] + '/eruda.js', 'r', encoding='utf-8') as f:
                self.wfile.write(f.read().encode('utf-8'))
            return
        elif self.path == '/script.js':
            # Serve the injected answer-fetching script.
            self.send_response(200)
            self.end_headers()
            with open(sys.path[0] + '/script.js', 'r', encoding='utf-8') as f:
                self.wfile.write(f.read().encode('utf-8'))
            return
        else:
            # Forward everything else to the real server by IP, keeping the
            # Host header so virtual hosting still works.
            header['Host'] = self.target_host
            content_length = int(header.get('Content-Length', 0) or 0)
            body = self.rfile.read(content_length) if content_length else None
            request = urllib2.Request(url='http://' + '101.37.44.53' + self.path,
                                      data=body, headers=header, method=self.command)
            logger.info('request: %s', request.full_url)
            with urllib2.urlopen(request) as f:
                logger.info('status: %s', f.status)
                logger.info('headers: %s', f.headers)

                self.send_response(f.status)
                # Only Content-Type is passed through: Content-Length and
                # Content-Encoding would no longer match the rewritten body.
                for (key, value) in f.headers.items():
                    if key == 'Content-Type':
                        self.send_header(key, value)
                self.end_headers()
                data = f.read()
                if f.headers['Content-Encoding'] == 'gzip':
                    data = gzip.decompress(data)
                data = data.decode('utf-8')
                # Downgrade links so the page keeps going through this proxy.
                if 'https' in data:
                    data = data.replace('https://', 'http://')
                logger.info('data: %s', data)
                logger.info('<<<<: %s', self.path)
                data = get_answer(data)
                self.wfile.write(data.encode('utf-8'))

    def do_GET(self):
        self.proxy()

    def do_POST(self):
        self.proxy()


def get_answer(data: str):
    # Despite the name, this injects the two <script> tags (eruda console
    # and our answer fetcher) right before </body>.
    if '</body>' in data:
        i = data.index('</body>')
        script = '''
<script src="//kaoshi.wjx.top/npm/eruda"></script><script>setTimeout(() => eruda.init(), 3000);</script>
'''
        script = script + '''<script src="//kaoshi.wjx.top/script.js"></script>'''
        data = data[:i] + script + data[i:]
    return data


def run():
    server_address = ('', 80)
    httpd = HTTPServer(server_address, ProxyHandler)
    httpd.serve_forever()


if __name__ == '__main__':
    run()
document.addEventListener("DOMContentLoaded", (event) => {
    let next = document.querySelector('a[onclick="show_next_page();"]');
    next.addEventListener('click', (event) => {
        event.preventDefault();
        show_next_page();
        // The fieldset with an empty style attribute is the visible question.
        let q = document.querySelector('fieldset[style=""]');
        if (q.page != 1) {
            t = q.querySelector('div.topichtml').innerHTML;
            o = Array.from(q.querySelectorAll('div.label')).map((e) => e.innerHTML);
            console.log(t);
            console.log(o);
            fetch('https://api.dify.ai/v1/chat-messages', {
                method: 'POST',
                headers: {
                    'Authorization': 'Bearer xxxxxx',
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify({
                    "inputs": {},
                    // "query": "党员领导干部的配偶、子女及其配偶,违反有关规定在该党员领导干部管辖的地区和业务范围内从事可能影响其公正执行公务的经营活动,或者有其他违反经商办企业禁止规定行为的,该党员领导干部应当按照规定予以纠正;拒不纠正的,()。\nA.其本人应当辞去现任职务\nB.由组织予以调整职务\nC.其本人应当辞去现任职务或者由组织予以调整职务\nD.直接撤销党内职务处分\n答案是什么?",
                    // question text, the options, then "答案是什么?" ("what is the answer?")
                    "query": t + '\n' + o.join('\n') + '\n答案是什么?',
                    "response_mode": "blocking",
                    "conversation_id": "",
                    "user": "abc-123",
                    "files": []
                })
            })
            .then(response => response.json())
            .then(data => {
                console.log(data);
                // response.json() has already parsed the body; in blocking
                // mode dify returns the text in the `answer` field.
                r = data['answer'];
                p = document.createElement('p');
                p.innerHTML = r;
                // Append the answer below the question text.
                q.querySelector('div.topichtml').append(p);
            });
        }
    });
});

interrupt

Java's Thread.interrupt() essentially just sets a mark on the thread object, and a thread is free to ignore that mark if it chooses. If the thread is in a blocking call, however, the mark makes it leave the blocked state by throwing an InterruptedException. InterruptedException is a checked exception, so code invoking blocking methods such as Thread.sleep() or Object.wait() must either catch it or declare it; note it is the blocking call, not interrupt() itself, that throws. When the exception is thrown, the thread's interrupt mark is cleared.

Using interrupt, one can build an asynchronous task manager whose threads can be terminated; a rough Python analogue is sketched below.
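Python, the language used elsewhere in these notes, has no direct counterpart to Java's interrupt mark, so the following is only a minimal sketch of the same idea under that caveat: a threading.Event plays the role of the mark, and Event.wait() doubles as an interruptible sleep. All names here are hypothetical.

import threading
import time

class CancellableTask:
    # A cooperative "interrupt mark", analogous to Java's flag: cancel()
    # only sets it; the worker decides when (and whether) to honour it.
    def __init__(self):
        self._cancelled = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        self._thread.start()

    def cancel(self):
        self._cancelled.set()

    def join(self):
        self._thread.join()

    def _run(self):
        for step in range(100):
            # Like a blocked Java thread waking with InterruptedException,
            # wait() returns True immediately once the mark is set.
            if self._cancelled.wait(timeout=0.5):
                print('task cancelled at step', step)
                return
            print('working, step', step)

task = CancellableTask()
task.start()
time.sleep(2)
task.cancel()
task.join()

Unlike Java, nothing here is cleared automatically: the Event stays set until the task exits, which is closer to polling Thread.isInterrupted() than to the exception-throwing path.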

OpenAPI

The OpenAPI Specification (OAS) is a general, programming-language-agnostic standard for describing APIs. It lets both humans and machines discover and understand a service's capabilities without access to source code or documentation and without sniffing the interface's traffic. Properly defined, it lets a consumer understand and interact with a remote service using a minimum of implementation logic.

OpenAPI began as the Swagger specification; after Swagger was donated to the Linux Foundation in 2015 it was renamed OpenAPI, and the newest revision of the spec was defined as OpenAPI 3.0.

An API can be defined by writing YAML that follows the specification in any text editor, or edited online with the Swagger Editor.
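For a feel of what such a definition looks like, here is a minimal, hypothetical OpenAPI 3.0 document for a single endpoint; every path and schema name below is invented for illustration:

openapi: 3.0.3
info:
  title: Pet API          # hypothetical example service
  version: 1.0.0
paths:
  /pets/{petId}:
    get:
      summary: Get a pet by id
      parameters:
        - name: petId
          in: path
          required: true
          schema:
            type: integer
      responses:
        '200':
          description: The requested pet
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Pet'
components:
  schemas:
    Pet:
      type: object
      required: [id, name]
      properties:
        id:
          type: integer
        name:
          type: string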

A finished definition can be turned into interface documentation with APIFOX or RapiPdf, and the Swagger Editor can additionally generate server and client code in a variety of languages.

More related tools are listed at openapi.tools.

Questions:

  1. Combined with JSON Schema, could this support an interface-definition workflow like SOAP's or protobuf's: have a framework generate the server and client interface code and perform parameter validation at the framework level? (A code-first sketch follows.)
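FastAPI already demonstrates the code-first half of that idea: Pydantic models (whose constraints surface as JSON Schema in the generated spec) give framework-level parameter checking, and the OpenAPI document falls out for free. A minimal sketch, with the Item model and route invented for illustration:

from fastapi import FastAPI
from pydantic import BaseModel, Field

app = FastAPI()

class Item(BaseModel):
    # The model's constraints become JSON Schema in the generated spec.
    name: str = Field(min_length=1)
    price: float = Field(gt=0)

@app.post('/items')
def create_item(item: Item) -> Item:
    # A request that fails validation is rejected with a 422 before this
    # function ever runs: framework-level parameter checking.
    return item

# The OpenAPI document is served at /openapi.json; client stubs can then
# be generated from it, e.g. with openapi-generator.

The other half of the question, generating this kind of server stub from a hand-written spec, is what tools like openapi-generator target.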

WIKI RAG QA

wikiQA is a question-answering system that fetches documents from Confluence, builds document vectors for them, and then answers questions with llama3-chinese-alpaca.

wiki_qa.py:

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import ConfluenceLoader
import requests
import sys
import os
import json
import pickle
import pathlib
import concurrent.futures
from requests.cookies import cookiejar_from_dict


class WiKi_QA:
    def __init__(self):
        self.wiki_url = "xxxxx"
        self.embedding = OllamaEmbeddings(model='smartcreation/dmeta-embedding-zh:f16')

    def gen_vectors(self, jsessionid: str = None, space_key: str = None, page_ids: str = None):
        print('gen_vectors', jsessionid, space_key, page_ids)
        if not page_ids and space_key == 'healthy' and pathlib.Path(os.path.split(os.path.realpath(__file__))[0] + '/persist').exists():
            # Reuse the persisted vector store built by gen_healthy_vectors().
            print('use persist')
            self.vectordb = Chroma(persist_directory=os.path.split(os.path.realpath(__file__))[0] + '/persist', embedding_function=self.embedding)
        else:
            # Authenticate against Confluence with the browser's JSESSIONID.
            s = requests.Session()
            s.cookies = cookiejar_from_dict({
                'JSESSIONID': jsessionid
            })

            loader = ConfluenceLoader(
                url=self.wiki_url,
                session=s,
                cloud=False,
                space_key=None if space_key == '' else space_key,
                page_ids=page_ids.split(',') if page_ids else None,
                limit=1,
                max_pages=99999999
            )
            documents = loader.load()
            print(len(documents))
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500, chunk_overlap=100, add_start_index=True
            )
            texts = text_splitter.split_documents(documents)
            print(len(texts))
            self.vectordb = Chroma.from_documents(documents=texts, embedding=self.embedding)

    def fetch_health_docs(self, jsessionid: str):
        # Download all pages listed in page_ids.json concurrently.
        with open(sys.path[0] + '/page_ids.json', 'r', encoding='utf-8') as f:
            page_ids = json.loads(f.read())

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            future_to_url = {executor.submit(self.fetch_docs, jsessionid, page_id): page_id for page_id in page_ids}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))

    def fetch_docs(self, jsessionid: str, page_id: str):
        print('start', page_id)
        s = requests.Session()
        s.cookies = cookiejar_from_dict({
            'JSESSIONID': jsessionid
        })
        loader = ConfluenceLoader(
            url=self.wiki_url,
            session=s,
            cloud=False,
            page_ids=[page_id, ],
            limit=1,
            max_pages=99999999
        )
        documents = loader.lazy_load()
        index = 0
        for one in documents:
            print(one)
            # Cache each document to disk so vectors can be rebuilt offline.
            with open(sys.path[0] + f'/doc/{page_id}_{index}.pkl', 'wb') as f:
                pickle.dump(one, f)
            index = index + 1

    def gen_healthy_vectors(self):
        # Build a persistent vector store from the pickled documents.
        documents = []
        for plk in pathlib.Path(sys.path[0] + '/doc').iterdir():
            with open(sys.path[0] + f'/doc/{plk.name}', 'rb') as f:
                documents.append(pickle.load(f))
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500, chunk_overlap=100, add_start_index=True
        )
        texts = text_splitter.split_documents(documents)
        print(len(texts))
        self.vectordb = Chroma.from_documents(documents=texts, embedding=self.embedding, persist_directory=sys.path[0] + '/persist')

    def retrieval_qa_chain(self):
        self.retriever = self.vectordb.as_retriever(search_kwargs={"k": 8})
        self.llm = ChatOllama(model='lgkt/llama3-chinese-alpaca', temperature=0.)

        system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

        question_answer_chain = create_stuff_documents_chain(self.llm, prompt)
        self.chain = create_retrieval_chain(self.retriever, question_answer_chain)

    def answer_confluence(self, question: str) -> str:
        # chain.invoke returns a dict with "input", "context" and "answer".
        answer = self.chain.invoke({"input": question})
        return answer

app.py:

import streamlit as st
from wiki_qa import WiKi_QA

st.set_page_config(
    page_title='Wiki Q&A',
    page_icon='📚📚📚📚',
    layout='wide',
    initial_sidebar_state='auto',
)
if "config" not in st.session_state:
    st.session_state["config"] = {}
if "qa" not in st.session_state:
    st.session_state["qa"] = None

@st.cache_resource
def load_confluence(config):
    qa = WiKi_QA()
    qa.gen_vectors(config['jsession_id'], config['space_key'], config['page_ids'])
    qa.retrieval_qa_chain()
    return qa

with st.sidebar.form(key='Form1'):
    st.markdown('## Configuration')
    jsession_id = st.text_input(label="jsessionid",
                                help="grab JSESSIONID from the browser dev tools (F12)")
    space_key = st.text_input(label="space",
                              help="the wiki space key",
                              value="healthy")
    page_ids = st.text_input(label="page ids",
                             help="separate multiple page ids with commas")
    submitted1 = st.form_submit_button(label='Submit')

if submitted1:
    st.session_state["config"] = {
        "jsession_id": jsession_id if jsession_id != "" else None,
        "page_ids": page_ids if page_ids != "" else None,
        "space_key": space_key,
    }
    with st.spinner(text="..."):
        st.session_state["qa"] = load_confluence(st.session_state["config"])
    st.write("Ingested")


st.title("WIKI Q&A")

question = st.text_input('Ask a question', "商品中心有哪两部分组成?")

if st.button('Get answer', key='button2'):
    with st.spinner(text="..."):
        qa = st.session_state.get("qa")
        if qa is not None:
            result = qa.answer_confluence(question)
            st.write(result)
        else:
            st.write("Please submit the configuration first")

llama3 kept answering in English, so I switched to lgkt/llama3-chinese-alpaca, but its Chinese answers feel rather thin.

Questions:

  1. Why use RecursiveCharacterTextSplitter? How does it differ from the other splitters, and which other splitters exist? How should chunk_size and chunk_overlap be set? (The sketch after this list is one way to compare settings.)
  2. The retriever's results are sometimes clearly inaccurate; how can accuracy be improved?
  3. How should the embedding model be chosen? Does the choice matter for Chinese?
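
Not answers, but a small exploration sketch for questions 1 and 2: it splits a sample text under different splitters and chunk_size/chunk_overlap settings so the chunks can be eyeballed, then shows MMR search, one common knob for trimming redundant or off-topic hits. The sample.txt file and all settings are arbitrary placeholders.

from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

# Hypothetical local file; any reasonably long document works.
sample = open('sample.txt', encoding='utf-8').read()

# RecursiveCharacterTextSplitter tries "\n\n", "\n", " ", "" in order, so it
# prefers to cut on paragraph boundaries; CharacterTextSplitter cuts on a
# single separator. Printing the first chunk makes the difference visible.
for splitter in [
    RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100),
    CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100),
]:
    chunks = splitter.split_text(sample)
    print(type(splitter).__name__, len(chunks), repr(chunks[0][:80]))

# Larger chunks keep more context per hit but dilute the embedding; overlap
# guards against answers straddling a boundary. Sweep a few combinations:
for chunk_size, chunk_overlap in [(200, 0), (500, 100), (1000, 200)]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    print(chunk_size, chunk_overlap, len(splitter.split_text(sample)))

# For question 2: MMR re-ranks candidates for diversity instead of taking
# the raw top-k nearest neighbours (vectordb as in wiki_qa.py):
# retriever = vectordb.as_retriever(
#     search_type="mmr", search_kwargs={"k": 8, "fetch_k": 30}
# )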

Build a Retrieval Augmented Generation (RAG) App
Building a Confluence Q&A App with LangChain and ChatGPT
RAG行业交流中发现的一些问题和改进方法 (problems and improvements observed in RAG industry discussions)