hxf/backend/tests/pandas_test.py

126 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
import asyncio
import pandas as pd
import tempfile
import pickle
from datetime import datetime
from typing import Dict, Any, List
sys.path.insert(0,os.path.join(os.path.dirname(__file__),'..','','backend'))
def execute(df_1,df_2):
# 假设合同日期列是字符串类型,将其转换为日期类型
if '合同日期' in df_1.columns:
df_1['合同日期'] = pd.to_datetime(df_1['合同日期'])
if '合同日期' in df_2.columns:
df_2['合同日期'] = pd.to_datetime(df_2['合同日期'])
# 筛选出2024年和2025年的数据
filtered_df_1 = df_1[
(df_1['合同日期'].dt.year == 2024) | (df_1['合同日期'].dt.year == 2025)]
filtered_df_2 = df_2[
(df_2['合同日期'].dt.year == 2024) | (df_2['合同日期'].dt.year == 2025)]
# 合并两个数据框
combined_df = pd.concat([filtered_df_1[:5], filtered_df_2[:7]], ignore_index=True)
# 在去重前清理空值
# combined_df_clean = combined_df.dropna(subset=['项目号']) # 确保主键不为空
# 填充数值列的空值
combined_df_filled = combined_df.fillna({
'总合同额': 0,
'已确认比例': 0,
'分包合同额': 0
})
# 找出不同的项目
unique_projects = combined_df.drop_duplicates(subset=['项目号'])
return unique_projects
def test_load_selected_dataframes():
try:
file1_path = '2025年在手合同数据.xlsx.pkl'
file2_path = '2024年在手合同数据.xlsx.pkl'
target_filenames = [file1_path,file2_path]
dataframes = {}
base_dir = os.path.join("D:\workspace-py\chat-agent\\backend","data","uploads","excel_6")
all_files = os.listdir(base_dir)
for filename in target_filenames:
matching_files = []
for file in all_files:
if file.endswith(f"_{filename}") or file.endswith(f"_{filename}.pkl"):
matching_files.append(file)
if not matching_files:
print(f"未找到匹配的文件: {filename}")
# 如果有多个匹配文件,选择最新的
if len(matching_files) > 1:
matching_files.sort(key=lambda x: os.path.getmtime(os.path.join(base_dir, x)), reverse=True)
print(f"找到多个匹配文件,选择最新的: {matching_files[0]}")
continue
selected_file = matching_files[0]
file_path = os.path.join(base_dir, selected_file)
try:
# 优先加载pickle文件
if selected_file.endswith('.pkl'):
with open(file_path, 'rb') as f:
df = pickle.load(f)
print(f"成功从pickle加载文件: {selected_file}")
else:
# 如果没有pickle文件尝试加载原始文件
if selected_file.endswith(('.xlsx', '.xls')):
df = pd.read_excel(file_path)
elif selected_file.endswith('.csv'):
df = pd.read_csv(file_path)
else:
print(f"不支持的文件格式: {selected_file}")
continue
print(f"成功从原始文件加载: {selected_file}")
# 使用原始文件名作为key
dataframes[filename] = df
print(f"成功加载DataFrame: {filename}, 形状: {df.shape}")
except Exception as e:
print(f"加载文件失败 {selected_file}: {e}")
continue
return dataframes
except Exception as e:
print(e)
if __name__ == '__main__':
dataframes = test_load_selected_dataframes()
df_names = list(dataframes.keys())
if len(df_names) >= 2:
df_1 = dataframes[df_names[0]]
df_2 = dataframes[df_names[1]]
print(f"DataFrame 1 ({df_names[0]}) 形状: {df_1.shape}")
print(f"DataFrame 1 列名: {list(df_1.columns)}")
print(f"DataFrame 1 前几行:")
print(df_1.head())
print()
print(f"DataFrame 2 ({df_names[1]}) 形状: {df_2.shape}")
print(f"DataFrame 2 列名: {list(df_2.columns)}")
print(f"DataFrame 2 前几行:")
print(df_2.head())
print()
# 执行用户提供的数据处理逻辑
print("执行数据处理逻辑...")
result = execute(df_1, df_2)
print("处理结果:")
print(f"结果形状: {result.shape}")
print(f"结果列名: {list(result.columns)}")
print("结果数据:")
print(result)