# hxf/backend/tests/pandas_test.py
#
# 126 lines
# 4.6 KiB
# Python
# Raw Normal View History
#
# 2025-12-04 14:48:38 +08:00
import os
import sys
import asyncio
import pandas as pd
import tempfile
import pickle
from datetime import datetime
from typing import Dict, Any, List
sys.path.insert(0,os.path.join(os.path.dirname(__file__),'..','','backend'))
def execute(df_1, df_2):
    """Merge the 2024/2025 contracts of two DataFrames into one row per project.

    Each frame is filtered to rows whose '合同日期' falls in 2024 or 2025. The
    first 5 matching rows of ``df_1`` and the first 7 matching rows of ``df_2``
    are concatenated, missing amounts are replaced by 0, and rows repeating an
    earlier '项目号' are dropped (the first occurrence is kept).

    Args:
        df_1: DataFrame with a '合同日期' column (anything ``pd.to_datetime``
            accepts) and a '项目号' column.
        df_2: DataFrame with the same columns as ``df_1``.

    Returns:
        A new DataFrame with one row per distinct '项目号'. The input frames
        are left unmodified.

    Raises:
        KeyError: If either frame lacks '合同日期', or the combined frame
            lacks '项目号'.
    """
    date_col = '合同日期'

    def _only_target_years(df):
        # assign() builds a new frame, so the caller's DataFrame keeps its
        # original (possibly string-typed) date column.
        if date_col in df.columns:
            df = df.assign(**{date_col: pd.to_datetime(df[date_col])})
        years = df[date_col].dt.year
        return df[(years == 2024) | (years == 2025)]

    filtered_df_1 = _only_target_years(df_1)
    filtered_df_2 = _only_target_years(df_2)

    # Only a small sample of each frame is combined (5 and 7 rows).
    combined_df = pd.concat(
        [filtered_df_1[:5], filtered_df_2[:7]],
        ignore_index=True,
    )

    # Replace missing numeric values with 0. Columns named here that are absent
    # from the frame are ignored by fillna. Rows with a missing '项目号' are
    # not dropped before de-duplication.
    combined_df_filled = combined_df.fillna({
        '总合同额': 0,
        '已确认比例': 0,
        '分包合同额': 0,
    })

    # De-duplicate the *filled* frame; previously the filled result was
    # discarded and NaN amounts leaked into the returned data.
    unique_projects = combined_df_filled.drop_duplicates(subset=['项目号'])
    return unique_projects
def test_load_selected_dataframes():
    """Load the two hard-coded contract data files from the uploads directory.

    For every target filename, directory entries whose names end with
    ``_<filename>`` or ``_<filename>.pkl`` are candidates; when several match,
    the most recently modified one is used. ``.pkl`` files are unpickled,
    ``.xlsx``/``.xls`` files are read with ``pd.read_excel`` and ``.csv``
    files with ``pd.read_csv``.

    Despite the ``test_`` prefix this function returns data and is used as a
    loader by the ``__main__`` section of this script.

    Returns:
        dict: Maps each target filename to the object loaded for it
        (presumably a DataFrame; ``.shape`` is read from it). Targets that are
        missing, unsupported, or fail to load are omitted. An empty or partial
        dict is returned if an unexpected error (for example an unreadable
        directory) interrupts the scan.
    """
    file1_path = '2025年在手合同数据.xlsx.pkl'
    file2_path = '2024年在手合同数据.xlsx.pkl'
    target_filenames = [file1_path, file2_path]
    dataframes = {}
    # Raw string: the Windows path contains backslashes that must not be
    # interpreted as (invalid) escape sequences.
    base_dir = os.path.join(
        r"D:\workspace-py\chat-agent\backend", "data", "uploads", "excel_6"
    )
    try:
        all_files = os.listdir(base_dir)
        for filename in target_filenames:
            suffixes = (f"_{filename}", f"_{filename}.pkl")
            matching_files = [
                name for name in all_files if name.endswith(suffixes)
            ]
            if not matching_files:
                print(f"未找到匹配的文件: {filename}")
                # Nothing to load for this target; move on to the next one.
                continue
            # When several files match, prefer the most recently modified one.
            if len(matching_files) > 1:
                matching_files.sort(
                    key=lambda x: os.path.getmtime(os.path.join(base_dir, x)),
                    reverse=True,
                )
                print(f"找到多个匹配文件,选择最新的: {matching_files[0]}")
            selected_file = matching_files[0]
            file_path = os.path.join(base_dir, selected_file)
            try:
                # Prefer the pickle cache when that is what was selected.
                if selected_file.endswith('.pkl'):
                    # NOTE(security): pickle.load can execute arbitrary code;
                    # only use it on files this application wrote itself.
                    with open(file_path, 'rb') as f:
                        df = pickle.load(f)
                    print(f"成功从pickle加载文件: {selected_file}")
                else:
                    # No pickle: fall back to parsing the original file.
                    if selected_file.endswith(('.xlsx', '.xls')):
                        df = pd.read_excel(file_path)
                    elif selected_file.endswith('.csv'):
                        df = pd.read_csv(file_path)
                    else:
                        print(f"不支持的文件格式: {selected_file}")
                        continue
                    print(f"成功从原始文件加载: {selected_file}")
                # Key the result by the requested (target) filename.
                dataframes[filename] = df
                print(f"成功加载DataFrame: {filename}, 形状: {df.shape}")
            except Exception as e:
                # Best effort: one unreadable file must not block the others.
                print(f"加载文件失败 {selected_file}: {e}")
                continue
    except Exception as e:
        # Best-effort boundary of this script helper: report the problem and
        # fall through so the caller still receives a dict instead of None.
        print(e)
    return dataframes
if __name__ == '__main__':
    # Script entry point: load the contract data, show a preview of the first
    # two loaded frames, then run execute() on them and print the result.
    # Guard against the loader handing back None (e.g. after an early failure)
    # so the script reports the situation instead of raising AttributeError.
    dataframes = test_load_selected_dataframes() or {}
    df_names = list(dataframes.keys())
    if len(df_names) >= 2:
        df_1 = dataframes[df_names[0]]
        df_2 = dataframes[df_names[1]]

        print(f"DataFrame 1 ({df_names[0]}) 形状: {df_1.shape}")
        print(f"DataFrame 1 列名: {list(df_1.columns)}")
        print("DataFrame 1 前几行:")
        print(df_1.head())
        print()

        print(f"DataFrame 2 ({df_names[1]}) 形状: {df_2.shape}")
        print(f"DataFrame 2 列名: {list(df_2.columns)}")
        print("DataFrame 2 前几行:")
        print(df_2.head())
        print()

        # Run the user-supplied data-processing logic.
        print("执行数据处理逻辑...")
        result = execute(df_1, df_2)

        print("处理结果:")
        print(f"结果形状: {result.shape}")
        print(f"结果列名: {list(result.columns)}")
        print("结果数据:")
        print(result)
    else:
        # Make the "nothing happened" case visible instead of exiting silently.
        print(f"加载的DataFrame不足两个(实际 {len(df_names)} 个),跳过数据处理")