Upload files to /
This commit is contained in:
230
ESConnect.py
Normal file
@@ -0,0 +1,230 @@
from elasticsearch import Elasticsearch
import os
import json
import hashlib
import requests

# Elasticsearch connection settings
ES_URL = "http://localhost:9200"
AUTH = None  # set to ("username", "password") if authentication is required

# document = os.open('results/output.json', os.O_RDONLY)

# Create an Elasticsearch client instance connected to the local Elasticsearch service
es = Elasticsearch(["http://localhost:9200"])

# Index name
index_name = "wordsearch2"

def create_index_with_mapping():
    """Create the index with the corrected mapping configuration."""
    # Corrected mapping structure (illegal parameters removed from the keyword fields)
    mapping = {
        "mappings": {
            "properties": {
                "id": {
                    "type": "text",  # text type so the value is tokenized
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_smart"
                },
                "name": {
                    "type": "text",
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_smart"
                },
                "students": {"type": "keyword"},  # keyword fields take only the type parameter
                "teacher": {"type": "keyword"},
                "timestamp": {
                    "type": "date",
                    "format": "strict_date_optional_time||epoch_millis"
                }
            }
        }
    }

    # Create the index only if it does not already exist
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=mapping)
        print(f"Created index {index_name} with its mapping")
    else:
        print(f"Index {index_name} already exists")


def get_doc_id(data):
    """
    Generate a unique ID from the key fields (used for deduplication).
    Adjust the field combination to match actual needs.

    Args:
        data (dict): the document data

    Returns:
        str: an MD5 hash of the content, used as the unique ID
    """
    # Concatenate the key fields into one unique string
    unique_str = f"{data['id']}{data['name']}{data['students']}{data['teacher']}"
    # Hash with MD5 to produce the unique ID
    return hashlib.md5(unique_str.encode('utf-8')).hexdigest()
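

# Note: get_doc_id is defined above but never wired in -- insert_data below delegates to
# batch_write_data, which lets Elasticsearch auto-generate the document ID. A minimal
# deduplication sketch (hypothetical helper, not part of this commit): indexing with a
# content-derived ID makes a re-upload overwrite rather than duplicate.
def insert_data_deduplicated(data):
    doc_id = get_doc_id(data)  # same key fields -> same ID -> no duplicate documents
    es.index(index=index_name, id=doc_id, body=data)
    return doc_id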


def insert_data(data):
    """
    Insert a document into Elasticsearch.

    Args:
        data (dict): the data to insert

    Returns:
        bool: True on success, False on failure
    """
    # Delegates to the REST-based writer; Elasticsearch assigns the document ID
    return batch_write_data(data)


def search_data(query):
    """
    Search Elasticsearch for matching documents.

    Args:
        query (str): the search keyword

    Returns:
        list: the search results, one _source dict per matching document
    """
    # Multi-field match query across all fields
    result = es.search(index=index_name, body={"query": {"multi_match": {"query": query, "fields": ["*"]}}})
    # Return only the _source part of each hit
    return [hit["_source"] for hit in result['hits']['hits']]


def search_all():
    """
    Fetch all documents.

    Returns:
        list: all documents, each combining the document ID with its source data
    """
    # match_all query (returns at most the default 10 hits per request)
    result = es.search(index=index_name, body={"query": {"match_all": {}}})
    # Merge the document ID into each document's source data
    return [{
        "_id": hit["_id"],
        **hit["_source"]
    } for hit in result['hits']['hits']]
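

# Caveat: es.search returns at most 10 hits by default, so search_all above silently
# truncates once the index grows. A minimal fix sketch (assuming the index stays under
# Elasticsearch's default 10000-result window) is to request an explicit size:
#
#     result = es.search(index=index_name, body={"query": {"match_all": {}}, "size": 1000})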


def delete_by_id(doc_id):
    """
    Delete a document by its doc_id.

    Args:
        doc_id (str): the ID of the document to delete

    Returns:
        bool: True on success, False on failure
    """
    try:
        # Perform the delete
        es.delete(index=index_name, id=doc_id)
        return True
    except Exception as e:
        print("Delete failed:", str(e))
        return False


def search_by_any_field(keyword):
    """Fuzzy search across all fields (tolerates spelling errors)."""
    try:
        # update_mapping()
        response = requests.post(
            f"{ES_URL}/{index_name}/_search",
            auth=AUTH,
            json={
                "query": {
                    "multi_match": {
                        "query": keyword,
                        "fields": ["*"],      # match every field
                        "fuzziness": "AUTO",  # enable fuzzy matching
                    }
                }
            }
        )
        response.raise_for_status()
        results = response.json()["hits"]["hits"]
        print(f"\nFuzzy search for '{keyword}' found {len(results)} result(s):")

        for doc in results:
            print(f"\nDocument ID: {doc['_id']}")
            if '_source' in doc:
                max_key_len = max(len(k) for k in doc['_source'].keys())
                for key, value in doc['_source'].items():
                    # Use the highlighted fragment for this field if one exists
                    highlight = doc.get('highlight', {}).get(key, [value])[0]
                    print(f"{key:>{max_key_len + 2}} : {highlight}")
            else:
                print("No _source data")

        return results
    except requests.exceptions.HTTPError as e:
        print(f"Search failed: {e.response.text}")
        return []


def batch_write_data(data):
    """Write an award record (one document per call)."""
    try:
        response = requests.post(
            f"{ES_URL}/{index_name}/_doc",
            json=data,
            auth=AUTH,
            headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()
        doc_id = response.json()["_id"]
        print(f"Document written, ID: {doc_id}, content: {data}")
        return True
    except requests.exceptions.HTTPError as e:
        print(f"Document write failed: {e.response.text}, data: {data}")
        return False
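

# Despite its name, batch_write_data issues one HTTP request per document. A sketch of a
# true bulk write against the _bulk endpoint (hypothetical helper, not part of this
# commit), reusing the same ES_URL, AUTH, and index_name:
def bulk_write_data(items):
    # The _bulk API takes NDJSON: an action line, then the document, one pair per item,
    # with a mandatory trailing newline.
    lines = []
    for item in items:
        lines.append(json.dumps({"index": {"_index": index_name}}))
        lines.append(json.dumps(item, ensure_ascii=False))
    response = requests.post(
        f"{ES_URL}/_bulk",
        data=("\n".join(lines) + "\n").encode("utf-8"),
        auth=AUTH,
        headers={"Content-Type": "application/x-ndjson"}
    )
    response.raise_for_status()
    return not response.json().get("errors", True)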


def update_mapping():
    # New mapping configuration
    new_mapping = {
        "properties": {
            "id": {
                "type": "text",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_smart"
            },
            "name": {
                "type": "text",
                "analyzer": "ik_max_word"
            },
            "students": {
                "type": "keyword"
            },
            "teacher": {
                "type": "keyword"
            }
        }
    }

    # PUT the new mapping onto the index
    try:
        response = requests.put(
            f"{ES_URL}/{index_name}/_mapping",
            auth=AUTH,
            json=new_mapping,
            headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()
        print("Index mapping updated")
        print(response.json())

        # Verify the mapping update
        verify = requests.get(
            f"{ES_URL}/{index_name}/_mapping",
            auth=AUTH
        )
        print("\nVerification:")
        print(verify.json())
    except requests.exceptions.HTTPError as e:
        print(f"Request failed: {e.response.text}")
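

# Caveat: Elasticsearch rejects analyzer changes on existing text fields, so the
# update_mapping PUT above only succeeds when the declared fields are new or unchanged.
# The usual route for changing analyzers is a fresh index plus _reindex; a minimal
# sketch (hypothetical helper, the target index name is a placeholder you choose):
def reindex_to(new_index):
    resp = requests.post(
        f"{ES_URL}/_reindex",
        auth=AUTH,
        json={"source": {"index": index_name}, "dest": {"index": new_index}},
        headers={"Content-Type": "application/json"}
    )
    resp.raise_for_status()
    print(resp.json())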
10
ESTest.py
Normal file
@@ -0,0 +1,10 @@
from elasticsearch import Elasticsearch

# Connect to the local Elasticsearch instance
es = Elasticsearch(["http://localhost:9200"])

# Check whether the connection succeeded
if es.ping():
    print("Connection successful!")
else:
    print("Connection failed!")
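
# es.ping() only reports reachability as a bool; when it returns False, calling
# es.info() is a handier debugging step because it raises the underlying error
# (a small optional check, not part of this commit):
try:
    print(es.info())  # cluster name, version, and build details
except Exception as e:
    print("Connection error details:", e)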
228
app.py
Normal file
@@ -0,0 +1,228 @@
import base64
from flask import Flask, request, render_template, redirect, url_for, jsonify
import os
import uuid
from PIL import Image
import re
import json
from ESConnect import *
from openai import OpenAI
# import config

# Create the Flask application instance
app = Flask(__name__)
# app.config.from_object(config.Config)

# OCR and information extraction: send the image to a large-model API and extract structured fields
def ocr_and_extract_info(image_path):
    """
    Run OCR through a large-model API and extract structured information from the image.

    Args:
        image_path (str): path to the image file

    Returns:
        dict: the extracted information, shaped like {'id': '', 'name': '', 'students': '', 'teacher': ''}
    """
    def encode_image(image_path):
        """
        Encode the image as a base64 string.

        Args:
            image_path (str): path to the image file

        Returns:
            str: the base64-encoded image
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    # Convert the image to base64
    base64_image = encode_image(image_path)

    # Initialize the OpenAI client against the Baidu AI Studio API
    client = OpenAI(
        api_key="188f57db3766e02ed2c7e18373996d84f4112272",
        # AI Studio access token, see https://aistudio.baidu.com/account/accessToken
        base_url="https://aistudio.baidu.com/llm/lmapi/v3",  # AI Studio large-model API domain
    )

    # Call the large-model API to recognize the image and extract the information
    chat_completion = client.chat.completions.create(
        messages=[
            {'role': 'system', 'content': 'You are an assistant that understands images and text; answer based on the information the user provides.'},
            {'role': 'user', "content": [
                {"type": "text", "text": "Recognize the information in this image and output only the JSON, with no other text, so it is easy to parse. "
                                         "Return JSON (id is the competition or paper name, name is the project name, students are the participating students, teacher is the advisor; use a list when there are multiple names)"
                                         ": {'id':'', 'name':'','students':'','teacher':''}"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ]}
        ],
        model="ernie-4.5-turbo-vl-32k",  # Baidu ERNIE vision-language model
    )

    # Get the text content returned by the API
    response_text = chat_completion.choices[0].message.content

    def parse_response(text):
        """
        Parse the API response text and extract the JSON payload.

        Args:
            text (str): the text returned by the API

        Returns:
            dict or None: the parsed dict on success, None on failure
        """
        # First, try to parse the text as standard JSON
        try:
            result = json.loads(text)
            if result:
                print("success")
                return result
        except json.JSONDecodeError:
            print("Not parseable as standard JSON")

        # Next, try the contents of a markdown code block
        code_block = re.search(r'```json\n(.*?)```', text, re.DOTALL)
        if code_block:
            try:
                result = json.loads(code_block.group(1))
                if result:
                    print("success")
                    return result
            except json.JSONDecodeError:
                print("Not parseable as a markdown code block")

        # Finally, try replacing single quotes with double quotes
        try:
            fixed_json = text.replace("'", "\"")
            result = json.loads(fixed_json)
            if result:
                print("success")
                return result
        except json.JSONDecodeError:
            print("Could not repair by replacing single quotes")

    # Parse the API response text
    result_data = parse_response(response_text)
    return result_data

"""
Simulates a large model recognizing the image and returning structured JSON.
A real deployment should call Qwen-VL or another OCR + parsing service.
"""


# Home page route
@app.route('/')
def index():
    """
    Render the home page template.

    Returns:
        str: the rendered HTML page
    """
    return render_template('index.html')

# Image upload route
@app.route('/upload', methods=['POST'])
def upload_image():
    """
    Handle an image upload: run OCR recognition and store the result.

    Returns:
        JSON: a success or error response
    """
    # Get the uploaded file
    file = request.files.get('file')
    if not file:
        return jsonify({"error": "No file uploaded"}), 400

    # Save the uploaded image
    filename = f"{uuid.uuid4()}_{file.filename}"
    image_path = os.path.join("image", filename)
    file.save(image_path)

    # Run the large-model recognition
    try:
        data = ocr_and_extract_info(image_path)  # swap in the real AI service call here
        insert_data(data)  # store in Elasticsearch
        return jsonify({"message": "Recorded successfully", "data": data})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
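

# Hardening note: file.filename is user-controlled and is joined into a filesystem path
# above. A minimal sketch using Werkzeug's secure_filename (Werkzeug is already a Flask
# dependency; this replacement line is a suggestion, not part of this commit):
#
#     from werkzeug.utils import secure_filename
#     filename = f"{uuid.uuid4()}_{secure_filename(file.filename)}"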


# Search route
@app.route('/search')
def search():
    """
    Handle a search request by querying Elasticsearch for matching data.

    Returns:
        JSON: the list of search results
    """
    keyword = request.args.get('q')
    if not keyword:
        return jsonify([])
    results = search_by_any_field(keyword)
    print(results)
    return jsonify(results)


# Results page route
@app.route('/results')
def results_page():
    """
    Render the search results page.

    Returns:
        str: the rendered HTML page
    """
    return render_template('results.html')


# Show-all-data route
@app.route('/all')
def show_all():
    """
    Fetch all documents and render them on a page.

    Returns:
        str: the rendered HTML page containing all data
    """
    all_data = search_all()
    return render_template('all.html', data=all_data)


# Delete-data route
@app.route('/delete/<doc_id>', methods=['POST'])
def delete_entry(doc_id):
    """
    Delete a document by its ID.

    Args:
        doc_id (str): the ID of the document to delete

    Returns:
        a redirect to the all-data page, or an error message
    """
    if delete_by_id(doc_id):
        return redirect(url_for('show_all'))
    else:
        return "Delete failed", 500


# Main entry point
if __name__ == '__main__':
    # Create the Elasticsearch index
    create_index_with_mapping()
    # Create the image storage directory
    os.makedirs("image", exist_ok=True)
    # Start the Flask app
    app.run(use_reloader=False)
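

# A minimal smoke test for the endpoints once the app runs on Flask's default
# http://127.0.0.1:5000 ("cert.png" is a placeholder path, not part of this commit):
#
#     import requests
#     with open("cert.png", "rb") as f:
#         print(requests.post("http://127.0.0.1:5000/upload", files={"file": f}).json())
#     print(requests.get("http://127.0.0.1:5000/search", params={"q": "modeling"}).json())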