修改数据结构 #1
							
								
								
									
										71
									
								
								ESConnect.py
									
									
									
									
									
								
							
							
						
						
									
										71
									
								
								ESConnect.py
									
									
									
									
									
								
							@@ -15,7 +15,7 @@ AUTH = None  # 如需认证则改为("用户名","密码")
 | 
			
		||||
es = Elasticsearch(["http://localhost:9200"])
 | 
			
		||||
 | 
			
		||||
# 定义索引名称和类型名称
 | 
			
		||||
index_name = "wordsearch2"
 | 
			
		||||
index_name = "wordsearch2666"
 | 
			
		||||
 | 
			
		||||
def create_index_with_mapping():
 | 
			
		||||
    """修正后的索引映射配置"""
 | 
			
		||||
@@ -23,22 +23,12 @@ def create_index_with_mapping():
 | 
			
		||||
    mapping = {
 | 
			
		||||
        "mappings": {
 | 
			
		||||
            "properties": {
 | 
			
		||||
                "id": {
 | 
			
		||||
                    "type": "text",  # 改为text类型支持分词
 | 
			
		||||
                "data": {
 | 
			
		||||
                    "type": "text",  # 存储转换后的字符串,支持分词搜索
 | 
			
		||||
                    "analyzer": "ik_max_word",
 | 
			
		||||
                    "search_analyzer": "ik_smart"
 | 
			
		||||
                },
 | 
			
		||||
                "name": {
 | 
			
		||||
                    "type": "text",
 | 
			
		||||
                    "analyzer": "ik_max_word",
 | 
			
		||||
                    "search_analyzer": "ik_smart"
 | 
			
		||||
                },
 | 
			
		||||
                "students": {"type": "keyword"},  # 仅保留type参数
 | 
			
		||||
                "teacher": {"type": "keyword"},    # 仅保留type参数
 | 
			
		||||
                "timestamp": {
 | 
			
		||||
                    "type": "date",
 | 
			
		||||
                    "format": "strict_date_optional_time||epoch_millis"
 | 
			
		||||
                }
 | 
			
		||||
                "image": {"type": "keyword"},  # 存储图片路径或标识
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
@@ -54,8 +44,7 @@ def create_index_with_mapping():
 | 
			
		||||
 | 
			
		||||
def get_doc_id(data):
 | 
			
		||||
    """
 | 
			
		||||
    根据关键字段生成唯一ID(用于去重)
 | 
			
		||||
    可以根据实际需求调整字段组合
 | 
			
		||||
    根据数据内容生成唯一ID(用于去重)
 | 
			
		||||
    
 | 
			
		||||
    参数:
 | 
			
		||||
        data (dict): 包含文档数据的字典
 | 
			
		||||
@@ -63,8 +52,10 @@ def get_doc_id(data):
 | 
			
		||||
    返回:
 | 
			
		||||
        str: 基于数据内容生成的MD5哈希值作为唯一ID
 | 
			
		||||
    """
 | 
			
		||||
    # 组合关键字段生成唯一字符串
 | 
			
		||||
    unique_str = f"{data['id']}{data['name']}{data['students']}{data['teacher']}"
 | 
			
		||||
    # 使用data字段的内容生成唯一字符串
 | 
			
		||||
    data_str = data.get('data', '')
 | 
			
		||||
    image_str = data.get('image', '')
 | 
			
		||||
    unique_str = f"{data_str}{image_str}"
 | 
			
		||||
    # 使用MD5哈希生成唯一ID
 | 
			
		||||
    return hashlib.md5(unique_str.encode('utf-8')).hexdigest()
 | 
			
		||||
 | 
			
		||||
@@ -184,47 +175,3 @@ def batch_write_data(data):
 | 
			
		||||
    except requests.exceptions.HTTPError as e:
 | 
			
		||||
        print(f"文档写入失败: {e.response.text}, 数据: {data}")
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
def update_mapping():
 | 
			
		||||
    # 定义新的映射配置
 | 
			
		||||
    new_mapping = {
 | 
			
		||||
        "properties": {
 | 
			
		||||
            "id": {
 | 
			
		||||
                "type": "text",
 | 
			
		||||
                "analyzer": "ik_max_word",
 | 
			
		||||
                "search_analyzer": "ik_smart"
 | 
			
		||||
            },
 | 
			
		||||
            "name": {
 | 
			
		||||
                "type": "text",
 | 
			
		||||
                "analyzer": "ik_max_word"
 | 
			
		||||
            },
 | 
			
		||||
            "students": {
 | 
			
		||||
                "type": "keyword"
 | 
			
		||||
            },
 | 
			
		||||
            "teacher": {
 | 
			
		||||
                "type": "keyword"
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    # 执行PUT请求更新映射
 | 
			
		||||
    try:
 | 
			
		||||
        response = requests.put(
 | 
			
		||||
            f"{ES_URL}/{index_name}/_mapping",
 | 
			
		||||
            auth=AUTH,
 | 
			
		||||
            json=new_mapping,
 | 
			
		||||
            headers={"Content-Type": "application/json"}
 | 
			
		||||
        )
 | 
			
		||||
        response.raise_for_status()
 | 
			
		||||
        print("索引映射更新成功")
 | 
			
		||||
        print(response.json())
 | 
			
		||||
 | 
			
		||||
        # 验证映射更新结果
 | 
			
		||||
        verify = requests.get(
 | 
			
		||||
            f"{ES_URL}/{index_name}/_mapping",
 | 
			
		||||
            auth=AUTH
 | 
			
		||||
        )
 | 
			
		||||
        print("\n验证结果:")
 | 
			
		||||
        print(verify.json())
 | 
			
		||||
    except requests.exceptions.HTTPError as e:
 | 
			
		||||
        print(f"请求失败: {e.response.text}")
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										107
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										107
									
								
								app.py
									
									
									
									
									
								
							@@ -6,6 +6,7 @@ from PIL import Image
 | 
			
		||||
import re
 | 
			
		||||
import json
 | 
			
		||||
from ESConnect import *
 | 
			
		||||
from json_converter import json_to_string, string_to_json
 | 
			
		||||
from openai import OpenAI
 | 
			
		||||
# import config
 | 
			
		||||
 | 
			
		||||
@@ -52,9 +53,9 @@ def ocr_and_extract_info(image_path):
 | 
			
		||||
        messages=[
 | 
			
		||||
            {'role': 'system', 'content': '你是一个能理解图片和文本的助手,请根据用户提供的信息进行回答。'},
 | 
			
		||||
            {'role': 'user', "content": [
 | 
			
		||||
                {"type": "text", "text": "请识别这张图片中的信息,只显示json不显示其它信息便于解析"
 | 
			
		||||
                                         "以JSON格式返回(id对应比赛名称或论文名称,name对应项目名称,students对应参赛学生,teacher对应指导老师,出现多个名字用列表存储)"
 | 
			
		||||
                                         ":{'id':'', 'name':'','students':'','teacher':''}"},
 | 
			
		||||
                {"type": "text", "text": "请识别这张图片中的信息,将你认为重要的数据转换为不包含嵌套的json,不要显示其它信息以便于解析"
 | 
			
		||||
                                         "直接输出json结果即可"
 | 
			
		||||
                                         "你可以自行决定使用哪些json字段"},
 | 
			
		||||
                {
 | 
			
		||||
                    "type": "image_url",
 | 
			
		||||
                    "image_url": {
 | 
			
		||||
@@ -68,6 +69,12 @@ def ocr_and_extract_info(image_path):
 | 
			
		||||
 | 
			
		||||
    # 获取API返回的文本内容
 | 
			
		||||
    response_text = chat_completion.choices[0].message.content
 | 
			
		||||
    
 | 
			
		||||
    # 添加调试信息:输出模型返回的原始字符串
 | 
			
		||||
    print("=" * 50)
 | 
			
		||||
    print("模型返回的原始字符串:")
 | 
			
		||||
    print(response_text)
 | 
			
		||||
    print("=" * 50)
 | 
			
		||||
 | 
			
		||||
    def parse_respound(text):
 | 
			
		||||
        """
 | 
			
		||||
@@ -83,10 +90,10 @@ def ocr_and_extract_info(image_path):
 | 
			
		||||
        try:
 | 
			
		||||
            result=json.loads(text)
 | 
			
		||||
            if result:
 | 
			
		||||
                print("success")
 | 
			
		||||
                print("✓ 成功解析标准JSON格式")
 | 
			
		||||
                return result
 | 
			
		||||
        except json.JSONDecodeError:
 | 
			
		||||
            print("无法解析标准json")
 | 
			
		||||
            print("✗ 无法解析标准JSON格式")
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
        # 提取markdown代码块中的内容
 | 
			
		||||
@@ -95,10 +102,10 @@ def ocr_and_extract_info(image_path):
 | 
			
		||||
            try:
 | 
			
		||||
                result=json.loads(code_block.group(1))
 | 
			
		||||
                if result:
 | 
			
		||||
                    print("success")
 | 
			
		||||
                    print("✓ 成功解析markdown代码块中的JSON")
 | 
			
		||||
                    return result
 | 
			
		||||
            except json.JSONDecodeError:
 | 
			
		||||
                print("无法解析markdown")
 | 
			
		||||
                print("✗ 无法解析markdown代码块中的JSON")
 | 
			
		||||
                pass
 | 
			
		||||
 | 
			
		||||
        # 尝试替换单引号并解析
 | 
			
		||||
@@ -106,14 +113,23 @@ def ocr_and_extract_info(image_path):
 | 
			
		||||
            fixed_json = text.replace("'", "\"")
 | 
			
		||||
            result=json.loads(fixed_json)
 | 
			
		||||
            if(result):
 | 
			
		||||
                print("success")
 | 
			
		||||
                print("✓ 成功解析替换单引号后的JSON")
 | 
			
		||||
                return result
 | 
			
		||||
        except json.JSONDecodeError:
 | 
			
		||||
            print("无法替换单引号")
 | 
			
		||||
            print("✗ 无法解析替换单引号后的JSON")
 | 
			
		||||
            pass
 | 
			
		||||
 | 
			
		||||
    # 解析API返回的文本
 | 
			
		||||
    result_data = parse_respound(response_text)
 | 
			
		||||
    
 | 
			
		||||
    # 添加调试信息:输出解析结果
 | 
			
		||||
    print("解析结果:")
 | 
			
		||||
    if result_data:
 | 
			
		||||
        print(f"✓ 解析成功: {result_data}")
 | 
			
		||||
    else:
 | 
			
		||||
        print("✗ 解析失败,返回None")
 | 
			
		||||
    print("=" * 50)
 | 
			
		||||
    
 | 
			
		||||
    return result_data
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
@@ -154,10 +170,28 @@ def upload_image():
 | 
			
		||||
 | 
			
		||||
    # 调用大模型进行识别
 | 
			
		||||
    try:
 | 
			
		||||
        data = ocr_and_extract_info(image_path)  # 替换为真实AI接口调用
 | 
			
		||||
        insert_data(data)  # 存入ES
 | 
			
		||||
        return jsonify({"message": "成功录入", "data": data})
 | 
			
		||||
        print(f"开始处理图片: {image_path}")
 | 
			
		||||
        original_data = ocr_and_extract_info(image_path)  # 获取原始JSON数据
 | 
			
		||||
        if original_data:
 | 
			
		||||
            # 使用json_converter将JSON数据转换为字符串
 | 
			
		||||
            data_string = json_to_string(original_data)
 | 
			
		||||
            print(f"转换后的数据字符串: {data_string}")
 | 
			
		||||
            
 | 
			
		||||
            # 构造新的数据结构,只包含data和image字段
 | 
			
		||||
            processed_data = {
 | 
			
		||||
                "data": data_string,
 | 
			
		||||
                "image": filename  # 存储图片文件名
 | 
			
		||||
            }
 | 
			
		||||
            print(f"准备存储的数据: {processed_data}")
 | 
			
		||||
            
 | 
			
		||||
            insert_data(processed_data)  # 存入ES
 | 
			
		||||
            print("✓ 数据成功存储到Elasticsearch")
 | 
			
		||||
            return jsonify({"message": "成功录入", "data": original_data, "processed": processed_data})
 | 
			
		||||
        else:
 | 
			
		||||
            print("✗ 无法识别图片内容")
 | 
			
		||||
            return jsonify({"error": "无法识别图片内容"}), 400
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        print(f"✗ 处理过程中发生错误: {str(e)}")
 | 
			
		||||
        return jsonify({"error": str(e)}), 500
 | 
			
		||||
 | 
			
		||||
# 搜索路由
 | 
			
		||||
@@ -173,8 +207,31 @@ def search():
 | 
			
		||||
    if not keyword:
 | 
			
		||||
        return jsonify([])
 | 
			
		||||
    results = search_by_any_field(keyword)
 | 
			
		||||
    print(results)
 | 
			
		||||
    return jsonify(results)
 | 
			
		||||
    
 | 
			
		||||
    # 处理搜索结果,将data字段转换回JSON格式
 | 
			
		||||
    processed_results = []
 | 
			
		||||
    for result in results:
 | 
			
		||||
        if '_source' in result and 'data' in result['_source']:
 | 
			
		||||
            try:
 | 
			
		||||
                # 将data字段的字符串转换回JSON
 | 
			
		||||
                original_data = string_to_json(result['_source']['data'])
 | 
			
		||||
                # 构造新的结果格式
 | 
			
		||||
                processed_result = {
 | 
			
		||||
                    '_id': result.get('_id', ''),
 | 
			
		||||
                    '_source': {
 | 
			
		||||
                        'image': result['_source'].get('image', ''),
 | 
			
		||||
                        **original_data  # 展开原始数据字段
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
                processed_results.append(processed_result)
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                # 如果转换失败,保持原始格式
 | 
			
		||||
                processed_results.append(result)
 | 
			
		||||
        else:
 | 
			
		||||
            processed_results.append(result)
 | 
			
		||||
    
 | 
			
		||||
    print(processed_results)
 | 
			
		||||
    return jsonify(processed_results)
 | 
			
		||||
 | 
			
		||||
# 结果页面路由
 | 
			
		||||
@app.route('/results')
 | 
			
		||||
@@ -197,7 +254,27 @@ def show_all():
 | 
			
		||||
        str: 渲染后的HTML页面,包含所有数据
 | 
			
		||||
    """
 | 
			
		||||
    all_data = search_all()
 | 
			
		||||
    return render_template('all.html', data=all_data)
 | 
			
		||||
    # 将data字段从字符串转换回JSON格式以便显示
 | 
			
		||||
    processed_data = []
 | 
			
		||||
    for item in all_data:
 | 
			
		||||
        if 'data' in item and item['data']:
 | 
			
		||||
            try:
 | 
			
		||||
                # 将data字段的字符串转换回JSON
 | 
			
		||||
                original_data = string_to_json(item['data'])
 | 
			
		||||
                # 合并原始数据和其他字段
 | 
			
		||||
                display_item = {
 | 
			
		||||
                    '_id': item['_id'],
 | 
			
		||||
                    'image': item.get('image', ''),
 | 
			
		||||
                    **original_data  # 展开原始数据字段
 | 
			
		||||
                }
 | 
			
		||||
                processed_data.append(display_item)
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                # 如果转换失败,保持原始格式
 | 
			
		||||
                processed_data.append(item)
 | 
			
		||||
        else:
 | 
			
		||||
            processed_data.append(item)
 | 
			
		||||
    
 | 
			
		||||
    return render_template('all.html', data=processed_data)
 | 
			
		||||
 | 
			
		||||
# 删除数据路由
 | 
			
		||||
@app.route('/delete/<doc_id>', methods=['POST'])
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										6
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
			
		||||
flask==3.1.1
 | 
			
		||||
pillow==11.1.0
 | 
			
		||||
openai==1.88.0
 | 
			
		||||
elasticsearch==7.17.0
 | 
			
		||||
pandas==2.2.3
 | 
			
		||||
requests
 | 
			
		||||
		Reference in New Issue
	
	Block a user