修改数据结构
This commit is contained in:
71
ESConnect.py
71
ESConnect.py
@@ -15,7 +15,7 @@ AUTH = None # 如需认证则改为("用户名","密码")
|
||||
es = Elasticsearch(["http://localhost:9200"])
|
||||
|
||||
# 定义索引名称和类型名称
|
||||
index_name = "wordsearch2"
|
||||
index_name = "wordsearch2666"
|
||||
|
||||
def create_index_with_mapping():
|
||||
"""修正后的索引映射配置"""
|
||||
@@ -23,22 +23,12 @@ def create_index_with_mapping():
|
||||
mapping = {
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "text", # 改为text类型支持分词
|
||||
"data": {
|
||||
"type": "text", # 存储转换后的字符串,支持分词搜索
|
||||
"analyzer": "ik_max_word",
|
||||
"search_analyzer": "ik_smart"
|
||||
},
|
||||
"name": {
|
||||
"type": "text",
|
||||
"analyzer": "ik_max_word",
|
||||
"search_analyzer": "ik_smart"
|
||||
},
|
||||
"students": {"type": "keyword"}, # 仅保留type参数
|
||||
"teacher": {"type": "keyword"}, # 仅保留type参数
|
||||
"timestamp": {
|
||||
"type": "date",
|
||||
"format": "strict_date_optional_time||epoch_millis"
|
||||
}
|
||||
"image": {"type": "keyword"}, # 存储图片路径或标识
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -54,8 +44,7 @@ def create_index_with_mapping():
|
||||
|
||||
def get_doc_id(data):
|
||||
"""
|
||||
根据关键字段生成唯一ID(用于去重)
|
||||
可以根据实际需求调整字段组合
|
||||
根据数据内容生成唯一ID(用于去重)
|
||||
|
||||
参数:
|
||||
data (dict): 包含文档数据的字典
|
||||
@@ -63,8 +52,10 @@ def get_doc_id(data):
|
||||
返回:
|
||||
str: 基于数据内容生成的MD5哈希值作为唯一ID
|
||||
"""
|
||||
# 组合关键字段生成唯一字符串
|
||||
unique_str = f"{data['id']}{data['name']}{data['students']}{data['teacher']}"
|
||||
# 使用data字段的内容生成唯一字符串
|
||||
data_str = data.get('data', '')
|
||||
image_str = data.get('image', '')
|
||||
unique_str = f"{data_str}{image_str}"
|
||||
# 使用MD5哈希生成唯一ID
|
||||
return hashlib.md5(unique_str.encode('utf-8')).hexdigest()
|
||||
|
||||
@@ -184,47 +175,3 @@ def batch_write_data(data):
|
||||
except requests.exceptions.HTTPError as e:
|
||||
print(f"文档写入失败: {e.response.text}, 数据: {data}")
|
||||
return False
|
||||
|
||||
def update_mapping():
    """Push the field mapping to the index and print what the server reports back.

    Sends a PUT to ``{ES_URL}/{index_name}/_mapping`` with the mapping defined
    below, then issues a GET on the same URL and prints the returned mapping.
    Both responses are status-checked, so an error body is never printed as
    if it were a successful verification result.

    Returns:
        None. Outcomes are reported on stdout only; request failures are
        printed rather than raised.
    """
    # Mapping to apply: analysed text for id/name, exact-match keywords for
    # students/teacher.
    new_mapping = {
        "properties": {
            "id": {
                "type": "text",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_smart"
            },
            "name": {
                "type": "text",
                "analyzer": "ik_max_word"
            },
            "students": {
                "type": "keyword"
            },
            "teacher": {
                "type": "keyword"
            }
        }
    }

    mapping_url = f"{ES_URL}/{index_name}/_mapping"

    try:
        # Apply the mapping.
        response = requests.put(
            mapping_url,
            auth=AUTH,
            json=new_mapping,
            headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()
        print("索引映射更新成功")
        print(response.json())

        # Read the mapping back; check the status so a failed GET is reported
        # as a failure instead of being printed as the verification result.
        verify = requests.get(
            mapping_url,
            auth=AUTH
        )
        verify.raise_for_status()
        print("\n验证结果:")
        print(verify.json())
    except requests.exceptions.HTTPError as e:
        print(f"请求失败: {e.response.text}")
    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts, etc. carry no response body to show.
        print(f"请求失败: {e}")
|
||||
|
||||
107
app.py
107
app.py
@@ -6,6 +6,7 @@ from PIL import Image
|
||||
import re
|
||||
import json
|
||||
from ESConnect import *
|
||||
from json_converter import json_to_string, string_to_json
|
||||
from openai import OpenAI
|
||||
# import config
|
||||
|
||||
@@ -52,9 +53,9 @@ def ocr_and_extract_info(image_path):
|
||||
messages=[
|
||||
{'role': 'system', 'content': '你是一个能理解图片和文本的助手,请根据用户提供的信息进行回答。'},
|
||||
{'role': 'user', "content": [
|
||||
{"type": "text", "text": "请识别这张图片中的信息,只显示json不显示其它信息便于解析"
|
||||
"以JSON格式返回(id对应比赛名称或论文名称,name对应项目名称,students对应参赛学生,teacher对应指导老师,出现多个名字用列表存储)"
|
||||
":{'id':'', 'name':'','students':'','teacher':''}"},
|
||||
{"type": "text", "text": "请识别这张图片中的信息,将你认为重要的数据转换为不包含嵌套的json,不要显示其它信息以便于解析"
|
||||
"直接输出json结果即可"
|
||||
"你可以自行决定使用哪些json字段"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
@@ -68,6 +69,12 @@ def ocr_and_extract_info(image_path):
|
||||
|
||||
# 获取API返回的文本内容
|
||||
response_text = chat_completion.choices[0].message.content
|
||||
|
||||
# 添加调试信息:输出模型返回的原始字符串
|
||||
print("=" * 50)
|
||||
print("模型返回的原始字符串:")
|
||||
print(response_text)
|
||||
print("=" * 50)
|
||||
|
||||
def parse_respound(text):
|
||||
"""
|
||||
@@ -83,10 +90,10 @@ def ocr_and_extract_info(image_path):
|
||||
try:
|
||||
result=json.loads(text)
|
||||
if result:
|
||||
print("success")
|
||||
print("✓ 成功解析标准JSON格式")
|
||||
return result
|
||||
except json.JSONDecodeError:
|
||||
print("无法解析标准json")
|
||||
print("✗ 无法解析标准JSON格式")
|
||||
pass
|
||||
|
||||
# 提取markdown代码块中的内容
|
||||
@@ -95,10 +102,10 @@ def ocr_and_extract_info(image_path):
|
||||
try:
|
||||
result=json.loads(code_block.group(1))
|
||||
if result:
|
||||
print("success")
|
||||
print("✓ 成功解析markdown代码块中的JSON")
|
||||
return result
|
||||
except json.JSONDecodeError:
|
||||
print("无法解析markdown")
|
||||
print("✗ 无法解析markdown代码块中的JSON")
|
||||
pass
|
||||
|
||||
# 尝试替换单引号并解析
|
||||
@@ -106,14 +113,23 @@ def ocr_and_extract_info(image_path):
|
||||
fixed_json = text.replace("'", "\"")
|
||||
result=json.loads(fixed_json)
|
||||
if(result):
|
||||
print("success")
|
||||
print("✓ 成功解析替换单引号后的JSON")
|
||||
return result
|
||||
except json.JSONDecodeError:
|
||||
print("无法替换单引号")
|
||||
print("✗ 无法解析替换单引号后的JSON")
|
||||
pass
|
||||
|
||||
# 解析API返回的文本
|
||||
result_data = parse_respound(response_text)
|
||||
|
||||
# 添加调试信息:输出解析结果
|
||||
print("解析结果:")
|
||||
if result_data:
|
||||
print(f"✓ 解析成功: {result_data}")
|
||||
else:
|
||||
print("✗ 解析失败,返回None")
|
||||
print("=" * 50)
|
||||
|
||||
return result_data
|
||||
|
||||
"""
|
||||
@@ -154,10 +170,28 @@ def upload_image():
|
||||
|
||||
# 调用大模型进行识别
|
||||
try:
|
||||
data = ocr_and_extract_info(image_path) # 替换为真实AI接口调用
|
||||
insert_data(data) # 存入ES
|
||||
return jsonify({"message": "成功录入", "data": data})
|
||||
print(f"开始处理图片: {image_path}")
|
||||
original_data = ocr_and_extract_info(image_path) # 获取原始JSON数据
|
||||
if original_data:
|
||||
# 使用json_converter将JSON数据转换为字符串
|
||||
data_string = json_to_string(original_data)
|
||||
print(f"转换后的数据字符串: {data_string}")
|
||||
|
||||
# 构造新的数据结构,只包含data和image字段
|
||||
processed_data = {
|
||||
"data": data_string,
|
||||
"image": filename # 存储图片文件名
|
||||
}
|
||||
print(f"准备存储的数据: {processed_data}")
|
||||
|
||||
insert_data(processed_data) # 存入ES
|
||||
print("✓ 数据成功存储到Elasticsearch")
|
||||
return jsonify({"message": "成功录入", "data": original_data, "processed": processed_data})
|
||||
else:
|
||||
print("✗ 无法识别图片内容")
|
||||
return jsonify({"error": "无法识别图片内容"}), 400
|
||||
except Exception as e:
|
||||
print(f"✗ 处理过程中发生错误: {str(e)}")
|
||||
return jsonify({"error": str(e)}), 500
|
||||
|
||||
# 搜索路由
|
||||
@@ -173,8 +207,31 @@ def search():
|
||||
if not keyword:
|
||||
return jsonify([])
|
||||
results = search_by_any_field(keyword)
|
||||
print(results)
|
||||
return jsonify(results)
|
||||
|
||||
# 处理搜索结果,将data字段转换回JSON格式
|
||||
processed_results = []
|
||||
for result in results:
|
||||
if '_source' in result and 'data' in result['_source']:
|
||||
try:
|
||||
# 将data字段的字符串转换回JSON
|
||||
original_data = string_to_json(result['_source']['data'])
|
||||
# 构造新的结果格式
|
||||
processed_result = {
|
||||
'_id': result.get('_id', ''),
|
||||
'_source': {
|
||||
'image': result['_source'].get('image', ''),
|
||||
**original_data # 展开原始数据字段
|
||||
}
|
||||
}
|
||||
processed_results.append(processed_result)
|
||||
except Exception as e:
|
||||
# 如果转换失败,保持原始格式
|
||||
processed_results.append(result)
|
||||
else:
|
||||
processed_results.append(result)
|
||||
|
||||
print(processed_results)
|
||||
return jsonify(processed_results)
|
||||
|
||||
# 结果页面路由
|
||||
@app.route('/results')
|
||||
@@ -197,7 +254,27 @@ def show_all():
|
||||
str: 渲染后的HTML页面,包含所有数据
|
||||
"""
|
||||
all_data = search_all()
|
||||
return render_template('all.html', data=all_data)
|
||||
# 将data字段从字符串转换回JSON格式以便显示
|
||||
processed_data = []
|
||||
for item in all_data:
|
||||
if 'data' in item and item['data']:
|
||||
try:
|
||||
# 将data字段的字符串转换回JSON
|
||||
original_data = string_to_json(item['data'])
|
||||
# 合并原始数据和其他字段
|
||||
display_item = {
|
||||
'_id': item['_id'],
|
||||
'image': item.get('image', ''),
|
||||
**original_data # 展开原始数据字段
|
||||
}
|
||||
processed_data.append(display_item)
|
||||
except Exception as e:
|
||||
# 如果转换失败,保持原始格式
|
||||
processed_data.append(item)
|
||||
else:
|
||||
processed_data.append(item)
|
||||
|
||||
return render_template('all.html', data=processed_data)
|
||||
|
||||
# 删除数据路由
|
||||
@app.route('/delete/<doc_id>', methods=['POST'])
|
||||
|
||||
100
json_converter.py
Normal file
100
json_converter.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import json
|
||||
|
||||
|
||||
def json_to_string(json_data):
    """Flatten a dictionary into a single delimiter-separated string.

    Each entry is rendered as ``key:value``. A list value is rendered as
    ``key:[a|##|b|##|c]`` with every element passed through ``str``.
    Entries are joined with ``|###|`` in the dictionary's iteration order.

    Args:
        json_data (dict): The mapping to flatten.

    Returns:
        str: The flattened representation; an empty string for an empty dict.

    Raises:
        ValueError: If ``json_data`` is not a dict.
    """
    if not isinstance(json_data, dict):
        raise ValueError("输入必须是字典类型")

    def _render(field, content):
        # Lists get the bracketed, "|##|"-joined form; everything else is
        # formatted as-is.
        if isinstance(content, list):
            joined = "|##|".join(map(str, content))
            return f"{field}:[{joined}]"
        return f"{field}:{content}"

    # "|###|" separates the individual key/value entries.
    return "|###|".join(_render(field, content) for field, content in json_data.items())
|
||||
|
||||
|
||||
def string_to_json(data_string):
    """Parse a delimiter-separated string back into a flat dictionary.

    The input is split on ``|###|`` into entries, and each entry is split on
    its first ``:`` into a key and a value (both stripped of surrounding
    whitespace). Entries without a ``:`` are skipped. A value wrapped in
    ``[...]`` is treated as a list whose elements are separated by ``|##|``;
    ``[]`` yields an empty list.

    Scalar values and list elements are converted to ``int``/``float`` only
    when the conversion is lossless (the number renders back to exactly the
    same text). Text such as ``007`` or ``2023.10`` therefore stays a string
    instead of silently losing digits.

    Args:
        data_string (str): The encoded string to parse.

    Returns:
        dict: The decoded mapping; empty for blank input.

    Raises:
        ValueError: If ``data_string`` is not a str.
    """
    if not isinstance(data_string, str):
        raise ValueError("输入必须是字符串类型")

    if not data_string.strip():
        return {}

    result = {}

    for pair in data_string.split("|###|"):
        # An entry with no key/value separator cannot be decoded; skip it.
        if ":" not in pair:
            continue

        # Split on the first ":" only, so values may themselves contain ":".
        key, value = pair.split(":", 1)
        key = key.strip()
        value = value.strip()

        if value.startswith("[") and value.endswith("]"):
            list_content = value[1:-1]  # drop the surrounding brackets
            if list_content:
                result[key] = [
                    _coerce_scalar(item.strip())
                    for item in list_content.split("|##|")
                ]
            else:
                result[key] = []
        else:
            result[key] = _coerce_scalar(value)

    return result


def _coerce_scalar(text):
    """Return ``text`` as an int/float when that is lossless, else unchanged."""
    try:
        number = float(text) if "." in text else int(text)
    except ValueError:
        return text
    # Keep the number only if it prints back to the same text; this rejects
    # leading zeros, trailing zeros, "+5", "1_000", overflow to inf, etc.
    return number if str(number) == text else text
|
||||
@@ -1,5 +1,6 @@
|
||||
flask==3.1.1
|
||||
pillow==11.1.0
|
||||
openai==1.88.0
|
||||
elasticsearch==7.17.0
|
||||
pandas==2.2.3
|
||||
flask==3.1.1
|
||||
pillow==11.1.0
|
||||
openai==1.88.0
|
||||
elasticsearch==7.17.0
|
||||
pandas==2.2.3
|
||||
requests
|
||||
Reference in New Issue
Block a user