diff --git a/ESConnect.py b/ESConnect.py index 9833291..3c93b03 100644 --- a/ESConnect.py +++ b/ESConnect.py @@ -15,7 +15,7 @@ AUTH = None # 如需认证则改为("用户名","密码") es = Elasticsearch(["http://localhost:9200"]) # 定义索引名称和类型名称 -index_name = "wordsearch2" +index_name = "wordsearch2666" def create_index_with_mapping(): """修正后的索引映射配置""" @@ -23,22 +23,12 @@ def create_index_with_mapping(): mapping = { "mappings": { "properties": { - "id": { - "type": "text", # 改为text类型支持分词 + "data": { + "type": "text", # 存储转换后的字符串,支持分词搜索 "analyzer": "ik_max_word", "search_analyzer": "ik_smart" }, - "name": { - "type": "text", - "analyzer": "ik_max_word", - "search_analyzer": "ik_smart" - }, - "students": {"type": "keyword"}, # 仅保留type参数 - "teacher": {"type": "keyword"}, # 仅保留type参数 - "timestamp": { - "type": "date", - "format": "strict_date_optional_time||epoch_millis" - } + "image": {"type": "keyword"}, # 存储图片路径或标识 } } } @@ -54,8 +44,7 @@ def create_index_with_mapping(): def get_doc_id(data): """ - 根据关键字段生成唯一ID(用于去重) - 可以根据实际需求调整字段组合 + 根据数据内容生成唯一ID(用于去重) 参数: data (dict): 包含文档数据的字典 @@ -63,8 +52,10 @@ def get_doc_id(data): 返回: str: 基于数据内容生成的MD5哈希值作为唯一ID """ - # 组合关键字段生成唯一字符串 - unique_str = f"{data['id']}{data['name']}{data['students']}{data['teacher']}" + # 使用data字段的内容生成唯一字符串 + data_str = data.get('data', '') + image_str = data.get('image', '') + unique_str = f"{data_str}{image_str}" # 使用MD5哈希生成唯一ID return hashlib.md5(unique_str.encode('utf-8')).hexdigest() @@ -184,47 +175,3 @@ def batch_write_data(data): except requests.exceptions.HTTPError as e: print(f"文档写入失败: {e.response.text}, 数据: {data}") return False - -def update_mapping(): - # 定义新的映射配置 - new_mapping = { - "properties": { - "id": { - "type": "text", - "analyzer": "ik_max_word", - "search_analyzer": "ik_smart" - }, - "name": { - "type": "text", - "analyzer": "ik_max_word" - }, - "students": { - "type": "keyword" - }, - "teacher": { - "type": "keyword" - } - } - } - - # 执行PUT请求更新映射 - try: - response = requests.put( - f"{ES_URL}/{index_name}/_mapping", - auth=AUTH, - json=new_mapping, - headers={"Content-Type": "application/json"} - ) - response.raise_for_status() - print("索引映射更新成功") - print(response.json()) - - # 验证映射更新结果 - verify = requests.get( - f"{ES_URL}/{index_name}/_mapping", - auth=AUTH - ) - print("\n验证结果:") - print(verify.json()) - except requests.exceptions.HTTPError as e: - print(f"请求失败: {e.response.text}") diff --git a/app.py b/app.py index b84c62b..2d5b7f7 100644 --- a/app.py +++ b/app.py @@ -6,6 +6,7 @@ from PIL import Image import re import json from ESConnect import * +from json_converter import json_to_string, string_to_json from openai import OpenAI # import config @@ -52,9 +53,9 @@ def ocr_and_extract_info(image_path): messages=[ {'role': 'system', 'content': '你是一个能理解图片和文本的助手,请根据用户提供的信息进行回答。'}, {'role': 'user', "content": [ - {"type": "text", "text": "请识别这张图片中的信息,只显示json不显示其它信息便于解析" - "以JSON格式返回(id对应比赛名称或论文名称,name对应项目名称,students对应参赛学生,teacher对应指导老师,出现多个名字用列表存储)" - ":{'id':'', 'name':'','students':'','teacher':''}"}, + {"type": "text", "text": "请识别这张图片中的信息,将你认为重要的数据转换为不包含嵌套的json,不要显示其它信息以便于解析" + "直接输出json结果即可" + "你可以自行决定使用哪些json字段"}, { "type": "image_url", "image_url": { @@ -68,6 +69,12 @@ def ocr_and_extract_info(image_path): # 获取API返回的文本内容 response_text = chat_completion.choices[0].message.content + + # 添加调试信息:输出模型返回的原始字符串 + print("=" * 50) + print("模型返回的原始字符串:") + print(response_text) + print("=" * 50) def parse_respound(text): """ @@ -83,10 +90,10 @@ def ocr_and_extract_info(image_path): try: result=json.loads(text) if result: - print("success") + print("✓ 成功解析标准JSON格式") return result except json.JSONDecodeError: - print("无法解析标准json") + print("✗ 无法解析标准JSON格式") pass # 提取markdown代码块中的内容 @@ -95,10 +102,10 @@ def ocr_and_extract_info(image_path): try: result=json.loads(code_block.group(1)) if result: - print("success") + print("✓ 成功解析markdown代码块中的JSON") return result except json.JSONDecodeError: - print("无法解析markdown") + print("✗ 无法解析markdown代码块中的JSON") pass # 尝试替换单引号并解析 @@ -106,14 +113,23 @@ def ocr_and_extract_info(image_path): fixed_json = text.replace("'", "\"") result=json.loads(fixed_json) if(result): - print("success") + print("✓ 成功解析替换单引号后的JSON") return result except json.JSONDecodeError: - print("无法替换单引号") + print("✗ 无法解析替换单引号后的JSON") pass # 解析API返回的文本 result_data = parse_respound(response_text) + + # 添加调试信息:输出解析结果 + print("解析结果:") + if result_data: + print(f"✓ 解析成功: {result_data}") + else: + print("✗ 解析失败,返回None") + print("=" * 50) + return result_data """ @@ -154,10 +170,28 @@ def upload_image(): # 调用大模型进行识别 try: - data = ocr_and_extract_info(image_path) # 替换为真实AI接口调用 - insert_data(data) # 存入ES - return jsonify({"message": "成功录入", "data": data}) + print(f"开始处理图片: {image_path}") + original_data = ocr_and_extract_info(image_path) # 获取原始JSON数据 + if original_data: + # 使用json_converter将JSON数据转换为字符串 + data_string = json_to_string(original_data) + print(f"转换后的数据字符串: {data_string}") + + # 构造新的数据结构,只包含data和image字段 + processed_data = { + "data": data_string, + "image": filename # 存储图片文件名 + } + print(f"准备存储的数据: {processed_data}") + + insert_data(processed_data) # 存入ES + print("✓ 数据成功存储到Elasticsearch") + return jsonify({"message": "成功录入", "data": original_data, "processed": processed_data}) + else: + print("✗ 无法识别图片内容") + return jsonify({"error": "无法识别图片内容"}), 400 except Exception as e: + print(f"✗ 处理过程中发生错误: {str(e)}") return jsonify({"error": str(e)}), 500 # 搜索路由 @@ -173,8 +207,31 @@ def search(): if not keyword: return jsonify([]) results = search_by_any_field(keyword) - print(results) - return jsonify(results) + + # 处理搜索结果,将data字段转换回JSON格式 + processed_results = [] + for result in results: + if '_source' in result and 'data' in result['_source']: + try: + # 将data字段的字符串转换回JSON + original_data = string_to_json(result['_source']['data']) + # 构造新的结果格式 + processed_result = { + '_id': result.get('_id', ''), + '_source': { + 'image': result['_source'].get('image', ''), + **original_data # 展开原始数据字段 + } + } + processed_results.append(processed_result) + except Exception as e: + # 如果转换失败,保持原始格式 + processed_results.append(result) + else: + processed_results.append(result) + + print(processed_results) + return jsonify(processed_results) # 结果页面路由 @app.route('/results') @@ -197,7 +254,27 @@ def show_all(): str: 渲染后的HTML页面,包含所有数据 """ all_data = search_all() - return render_template('all.html', data=all_data) + # 将data字段从字符串转换回JSON格式以便显示 + processed_data = [] + for item in all_data: + if 'data' in item and item['data']: + try: + # 将data字段的字符串转换回JSON + original_data = string_to_json(item['data']) + # 合并原始数据和其他字段 + display_item = { + '_id': item['_id'], + 'image': item.get('image', ''), + **original_data # 展开原始数据字段 + } + processed_data.append(display_item) + except Exception as e: + # 如果转换失败,保持原始格式 + processed_data.append(item) + else: + processed_data.append(item) + + return render_template('all.html', data=processed_data) # 删除数据路由 @app.route('/delete/', methods=['POST']) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..49fb356 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +flask==3.1.1 +pillow==11.1.0 +openai==1.88.0 +elasticsearch==7.17.0 +pandas==2.2.3 +requests \ No newline at end of file