From 657365f9de82aa9c6609e98de3e4ee42f7987dc8 Mon Sep 17 00:00:00 2001
From: spdis <q17721073823@outlook.com>
Date: Sun, 28 Sep 2025 21:45:02 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=95=B0=E6=8D=AE=E7=BB=93?=
 =?UTF-8?q?=E6=9E=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ESConnect.py      |  71 ++++--------------------------
 app.py            | 107 +++++++++++++++++++++++++++++++++++++++-------
 json_converter.py | 100 +++++++++++++++++++++++++++++++++++++++++++
 requirements.txt  |  11 ++---
 4 files changed, 207 insertions(+), 82 deletions(-)
 create mode 100644 json_converter.py

diff --git a/ESConnect.py b/ESConnect.py
index 9833291..3c93b03 100644
--- a/ESConnect.py
+++ b/ESConnect.py
@@ -15,7 +15,7 @@ AUTH = None  # 如需认证则改为("用户名","密码")
 es = Elasticsearch(["http://localhost:9200"])
 
 # 定义索引名称和类型名称
-index_name = "wordsearch2"
+index_name = "wordsearch2666"
 
 def create_index_with_mapping():
     """修正后的索引映射配置"""
@@ -23,22 +23,12 @@ def create_index_with_mapping():
     mapping = {
         "mappings": {
             "properties": {
-                "id": {
-                    "type": "text",  # 改为text类型支持分词
+                "data": {
+                    "type": "text",  # 存储转换后的字符串，支持分词搜索
                     "analyzer": "ik_max_word",
                     "search_analyzer": "ik_smart"
                 },
-                "name": {
-                    "type": "text",
-                    "analyzer": "ik_max_word",
-                    "search_analyzer": "ik_smart"
-                },
-                "students": {"type": "keyword"},  # 仅保留type参数
-                "teacher": {"type": "keyword"},    # 仅保留type参数
-                "timestamp": {
-                    "type": "date",
-                    "format": "strict_date_optional_time||epoch_millis"
-                }
+                "image": {"type": "keyword"},  # 存储图片路径或标识
             }
         }
     }
@@ -54,8 +44,7 @@ def create_index_with_mapping():
 
 def get_doc_id(data):
     """
-    根据关键字段生成唯一ID（用于去重）
-    可以根据实际需求调整字段组合
+    根据数据内容生成唯一ID（用于去重）
     
     参数:
         data (dict): 包含文档数据的字典
@@ -63,8 +52,10 @@ def get_doc_id(data):
     返回:
         str: 基于数据内容生成的MD5哈希值作为唯一ID
     """
-    # 组合关键字段生成唯一字符串
-    unique_str = f"{data['id']}{data['name']}{data['students']}{data['teacher']}"
+    # 使用data字段的内容生成唯一字符串
+    data_str = data.get('data', '')
+    image_str = data.get('image', '')
+    unique_str = f"{data_str}{image_str}"
     # 使用MD5哈希生成唯一ID
     return hashlib.md5(unique_str.encode('utf-8')).hexdigest()
 
@@ -184,47 +175,3 @@ def batch_write_data(data):
     except requests.exceptions.HTTPError as e:
         print(f"文档写入失败: {e.response.text}, 数据: {data}")
         return False
-
-def update_mapping():
-    # 定义新的映射配置
-    new_mapping = {
-        "properties": {
-            "id": {
-                "type": "text",
-                "analyzer": "ik_max_word",
-                "search_analyzer": "ik_smart"
-            },
-            "name": {
-                "type": "text",
-                "analyzer": "ik_max_word"
-            },
-            "students": {
-                "type": "keyword"
-            },
-            "teacher": {
-                "type": "keyword"
-            }
-        }
-    }
-
-    # 执行PUT请求更新映射
-    try:
-        response = requests.put(
-            f"{ES_URL}/{index_name}/_mapping",
-            auth=AUTH,
-            json=new_mapping,
-            headers={"Content-Type": "application/json"}
-        )
-        response.raise_for_status()
-        print("索引映射更新成功")
-        print(response.json())
-
-        # 验证映射更新结果
-        verify = requests.get(
-            f"{ES_URL}/{index_name}/_mapping",
-            auth=AUTH
-        )
-        print("\n验证结果：")
-        print(verify.json())
-    except requests.exceptions.HTTPError as e:
-        print(f"请求失败: {e.response.text}")
diff --git a/app.py b/app.py
index b84c62b..2d5b7f7 100644
--- a/app.py
+++ b/app.py
@@ -6,6 +6,7 @@ from PIL import Image
 import re
 import json
 from ESConnect import *
+from json_converter import json_to_string, string_to_json
 from openai import OpenAI
 # import config
 
@@ -52,9 +53,9 @@ def ocr_and_extract_info(image_path):
         messages=[
             {'role': 'system', 'content': '你是一个能理解图片和文本的助手，请根据用户提供的信息进行回答。'},
             {'role': 'user', "content": [
-                {"type": "text", "text": "请识别这张图片中的信息，只显示json不显示其它信息便于解析"
-                                         "以JSON格式返回（id对应比赛名称或论文名称，name对应项目名称，students对应参赛学生，teacher对应指导老师,出现多个名字用列表存储）"
-                                         "：{'id':'', 'name':'','students':'','teacher':''}"},
+                {"type": "text", "text": "请识别这张图片中的信息，将你认为重要的数据转换为不包含嵌套的json，不要显示其它信息以便于解析"
+                                         "直接输出json结果即可"
+                                         "你可以自行决定使用哪些json字段"},
                 {
                     "type": "image_url",
                     "image_url": {
@@ -68,6 +69,12 @@ def ocr_and_extract_info(image_path):
 
     # 获取API返回的文本内容
     response_text = chat_completion.choices[0].message.content
+    
+    # 添加调试信息：输出模型返回的原始字符串
+    print("=" * 50)
+    print("模型返回的原始字符串:")
+    print(response_text)
+    print("=" * 50)
 
     def parse_respound(text):
         """
@@ -83,10 +90,10 @@ def ocr_and_extract_info(image_path):
         try:
             result=json.loads(text)
             if result:
-                print("success")
+                print("✓ 成功解析标准JSON格式")
                 return result
         except json.JSONDecodeError:
-            print("无法解析标准json")
+            print("✗ 无法解析标准JSON格式")
             pass
 
         # 提取markdown代码块中的内容
@@ -95,10 +102,10 @@ def ocr_and_extract_info(image_path):
             try:
                 result=json.loads(code_block.group(1))
                 if result:
-                    print("success")
+                    print("✓ 成功解析markdown代码块中的JSON")
                     return result
             except json.JSONDecodeError:
-                print("无法解析markdown")
+                print("✗ 无法解析markdown代码块中的JSON")
                 pass
 
         # 尝试替换单引号并解析
@@ -106,14 +113,23 @@ def ocr_and_extract_info(image_path):
             fixed_json = text.replace("'", "\"")
             result=json.loads(fixed_json)
             if(result):
-                print("success")
+                print("✓ 成功解析替换单引号后的JSON")
                 return result
         except json.JSONDecodeError:
-            print("无法替换单引号")
+            print("✗ 无法解析替换单引号后的JSON")
             pass
 
     # 解析API返回的文本
     result_data = parse_respound(response_text)
+    
+    # 添加调试信息：输出解析结果
+    print("解析结果:")
+    if result_data:
+        print(f"✓ 解析成功: {result_data}")
+    else:
+        print("✗ 解析失败，返回None")
+    print("=" * 50)
+    
     return result_data
 
     """
@@ -154,10 +170,28 @@ def upload_image():
 
     # 调用大模型进行识别
     try:
-        data = ocr_and_extract_info(image_path)  # 替换为真实AI接口调用
-        insert_data(data)  # 存入ES
-        return jsonify({"message": "成功录入", "data": data})
+        print(f"开始处理图片: {image_path}")
+        original_data = ocr_and_extract_info(image_path)  # 获取原始JSON数据
+        if original_data:
+            # 使用json_converter将JSON数据转换为字符串
+            data_string = json_to_string(original_data)
+            print(f"转换后的数据字符串: {data_string}")
+            
+            # 构造新的数据结构，只包含data和image字段
+            processed_data = {
+                "data": data_string,
+                "image": filename  # 存储图片文件名
+            }
+            print(f"准备存储的数据: {processed_data}")
+            
+            insert_data(processed_data)  # 存入ES
+            print("✓ 数据成功存储到Elasticsearch")
+            return jsonify({"message": "成功录入", "data": original_data, "processed": processed_data})
+        else:
+            print("✗ 无法识别图片内容")
+            return jsonify({"error": "无法识别图片内容"}), 400
     except Exception as e:
+        print(f"✗ 处理过程中发生错误: {str(e)}")
         return jsonify({"error": str(e)}), 500
 
 # 搜索路由
@@ -173,8 +207,31 @@ def search():
     if not keyword:
         return jsonify([])
     results = search_by_any_field(keyword)
-    print(results)
-    return jsonify(results)
+    
+    # 处理搜索结果，将data字段转换回JSON格式
+    processed_results = []
+    for result in results:
+        if '_source' in result and 'data' in result['_source']:
+            try:
+                # 将data字段的字符串转换回JSON
+                original_data = string_to_json(result['_source']['data'])
+                # 构造新的结果格式
+                processed_result = {
+                    '_id': result.get('_id', ''),
+                    '_source': {
+                        'image': result['_source'].get('image', ''),
+                        **original_data  # 展开原始数据字段
+                    }
+                }
+                processed_results.append(processed_result)
+            except Exception as e:
+                # 如果转换失败，保持原始格式
+                processed_results.append(result)
+        else:
+            processed_results.append(result)
+    
+    print(processed_results)
+    return jsonify(processed_results)
 
 # 结果页面路由
 @app.route('/results')
@@ -197,7 +254,27 @@ def show_all():
         str: 渲染后的HTML页面，包含所有数据
     """
     all_data = search_all()
-    return render_template('all.html', data=all_data)
+    # 将data字段从字符串转换回JSON格式以便显示
+    processed_data = []
+    for item in all_data:
+        if 'data' in item and item['data']:
+            try:
+                # 将data字段的字符串转换回JSON
+                original_data = string_to_json(item['data'])
+                # 合并原始数据和其他字段
+                display_item = {
+                    '_id': item['_id'],
+                    'image': item.get('image', ''),
+                    **original_data  # 展开原始数据字段
+                }
+                processed_data.append(display_item)
+            except Exception as e:
+                # 如果转换失败，保持原始格式
+                processed_data.append(item)
+        else:
+            processed_data.append(item)
+    
+    return render_template('all.html', data=processed_data)
 
 # 删除数据路由
 @app.route('/delete/<doc_id>', methods=['POST'])
diff --git a/json_converter.py b/json_converter.py
new file mode 100644
index 0000000..9f7e7ff
--- /dev/null
+++ b/json_converter.py
@@ -0,0 +1,100 @@
+import json
+
+
+def json_to_string(json_data):
+    """Flatten a one-level dict into a single delimiter-separated string.
+
+    Every entry is rendered as ``key:value`` and the entries are joined
+    with ``|###|``. A list value is rendered as ``key:[a|##|b]``, i.e. its
+    elements are passed through ``str`` and joined with ``|##|``.
+
+    Args:
+        json_data (dict): Mapping to flatten.
+
+    Returns:
+        str: The flattened text; an empty dict yields "".
+
+    Raises:
+        ValueError: If ``json_data`` is not a dict.
+    """
+    if not isinstance(json_data, dict):
+        raise ValueError("输入必须是字典类型")
+
+    def render(name, content):
+        # Only lists get the bracketed |##| form; anything else is
+        # formatted as-is by the f-string below.
+        if isinstance(content, list):
+            content = "[" + "|##|".join(map(str, content)) + "]"
+        return f"{name}:{content}"
+
+    # |###| separates the rendered key/value entries.
+    return "|###|".join(render(k, v) for k, v in json_data.items())
+
+
+def _coerce_scalar(text):
+    """Return ``text`` as an int/float only when that loses no information.
+
+    The literal is converted only if ``str(number)`` reproduces it exactly,
+    so values such as "007", "2025.10" or "1_000" stay strings instead of
+    silently turning into 7, 2025.1 or 1000.
+    """
+    try:
+        number = float(text) if "." in text else int(text)
+    except ValueError:
+        return text
+    return number if str(number) == text else text
+
+
+def string_to_json(data_string):
+    """Parse a string produced by ``json_to_string`` back into a dict.
+
+    Entries are separated by ``|###|``; each entry is split on its first
+    ``:`` into key and value, and both are stripped of surrounding
+    whitespace. A value wrapped in ``[...]`` becomes a list whose elements
+    are separated by ``|##|``. Segments without a ``:`` are skipped.
+
+    The format carries no type information, so the result is a best-effort
+    reconstruction: canonical numeric literals become int/float, everything
+    else stays a string.
+
+    A plain string value that itself looks like ``[...]`` is also read
+    back as a list; the format cannot tell the two apart.
+
+    Args:
+        data_string (str): Flattened string to parse.
+
+    Returns:
+        dict: Parsed mapping; ``{}`` for empty or whitespace-only input.
+
+    Raises:
+        ValueError: If ``data_string`` is not a str.
+    """
+    if not isinstance(data_string, str):
+        raise ValueError("输入必须是字符串类型")
+
+    if not data_string.strip():
+        return {}
+
+    result = {}
+    # Top-level entries are separated by |###|.
+    for pair in data_string.split("|###|"):
+        key, sep, value = pair.partition(":")
+        if not sep:
+            # No key/value separator: nothing usable in this segment.
+            continue
+        key = key.strip()
+        value = value.strip()
+
+        if value.startswith("[") and value.endswith("]"):
+            # List value: drop the brackets, then split on |##|.
+            inner = value[1:-1]
+            # "[]" is an empty list, not a list holding one empty string.
+            result[key] = (
+                [_coerce_scalar(item.strip()) for item in inner.split("|##|")]
+                if inner
+                else []
+            )
+        else:
+            result[key] = _coerce_scalar(value)
+
+    return result
diff --git a/requirements.txt b/requirements.txt
index 943f585..803c1b9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
-flask==3.1.1
-pillow==11.1.0
-openai==1.88.0
-elasticsearch==7.17.0
-pandas==2.2.3
\ No newline at end of file
+flask==3.1.1
+pillow==11.1.0
+openai==1.88.0
+elasticsearch==7.17.0
+pandas==2.2.3
+requests
\ No newline at end of file