126 lines
4.6 KiB
Python
126 lines
4.6 KiB
Python
import os
|
||
import sys
|
||
import asyncio
|
||
import pandas as pd
|
||
import tempfile
|
||
import pickle
|
||
from datetime import datetime
|
||
from typing import Dict, Any, List
|
||
|
||
# Put the ``backend`` directory that sits one level above this file's
# directory at the front of the module search path.
_script_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(_script_dir, '..', '', 'backend'))
|
||
|
||
|
||
def execute(df_1: pd.DataFrame, df_2: pd.DataFrame) -> pd.DataFrame:
    """Combine the 2024/2025 contracts of two frames into one row per project.

    For each input, the ``合同日期`` column is parsed to datetimes and only the
    rows dated 2024 or 2025 are kept. The first 5 matching rows of ``df_1``
    and the first 7 of ``df_2`` are concatenated, missing values in the
    amount/ratio columns are replaced with 0, and rows repeating an already
    seen ``项目号`` are dropped (the first occurrence is kept).

    Args:
        df_1: DataFrame containing at least ``合同日期`` and ``项目号`` columns.
        df_2: DataFrame with the same column requirements as ``df_1``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index before de-duplication.
        The input frames are not modified.

    Raises:
        KeyError: If either input lacks ``合同日期`` or the combined frame
            lacks ``项目号``.
    """
    date_col = '合同日期'
    target_years = (2024, 2025)

    # NOTE(review): the per-frame row caps (5 and 7) look like debugging
    # leftovers; they are preserved here, confirm whether they are intended.
    selected = []
    for frame, row_limit in ((df_1, 5), (df_2, 7)):
        # ``assign`` returns a new frame, so the caller's DataFrame keeps its
        # original (possibly string-typed) date column.
        parsed = frame.assign(**{date_col: pd.to_datetime(frame[date_col])})
        in_target_years = parsed[date_col].dt.year.isin(target_years)
        selected.append(parsed[in_target_years].iloc[:row_limit])

    combined_df = pd.concat(selected, ignore_index=True)

    # Replace missing numeric values; keys naming absent columns are ignored.
    combined_df_filled = combined_df.fillna({
        '总合同额': 0,
        '已确认比例': 0,
        '分包合同额': 0,
    })

    # De-duplicate the *filled* frame so the zero-fill above is reflected in
    # the returned data.
    # NOTE(review): rows with a missing 项目号 are not removed beforehand.
    unique_projects = combined_df_filled.drop_duplicates(subset=['项目号'])
    return unique_projects
|
||
|
||
|
||
|
||
def test_load_selected_dataframes() -> Dict[str, Any]:
    """Load the two contract datasets from the local upload directory.

    For each target filename, the upload directory is searched for entries
    whose name ends with ``_<filename>`` (or ``_<filename>.pkl``). When
    several entries match, the most recently modified one is used. ``.pkl``
    files are unpickled; ``.xlsx``/``.xls``/``.csv`` files are read with
    pandas.

    Returns:
        A dict mapping each target filename to the object loaded for it.
        Targets that are missing or fail to load are reported on stdout and
        left out, so the dict may be empty (for example when the upload
        directory cannot be listed).
    """
    target_filenames = [
        '2025年在手合同数据.xlsx.pkl',
        '2024年在手合同数据.xlsx.pkl',
    ]
    # Raw string: the Windows path contains ``\w`` and ``\c``, which are
    # invalid escape sequences in a normal string literal.
    base_dir = os.path.join(
        r"D:\workspace-py\chat-agent\backend", "data", "uploads", "excel_6"
    )
    dataframes = {}

    try:
        all_files = os.listdir(base_dir)
    except OSError as e:
        # Directory missing or unreadable: report it and return the empty
        # result rather than None, so callers can still treat it as a dict.
        print(e)
        return dataframes

    for filename in target_filenames:
        wanted_suffixes = (f"_{filename}", f"_{filename}.pkl")
        matching_files = [
            name for name in all_files if name.endswith(wanted_suffixes)
        ]
        if not matching_files:
            print(f"未找到匹配的文件: {filename}")
            continue

        selected_file = matching_files[0]
        if len(matching_files) > 1:
            # Several uploads of the same dataset: keep the newest one.
            try:
                selected_file = max(
                    matching_files,
                    key=lambda name: os.path.getmtime(
                        os.path.join(base_dir, name)
                    ),
                )
            except OSError as e:
                print(f"加载文件失败 {filename}: {e}")
                continue
            print(f"找到多个匹配文件,选择最新的: {selected_file}")

        file_path = os.path.join(base_dir, selected_file)

        try:
            if selected_file.endswith('.pkl'):
                # SECURITY: unpickling executes arbitrary code embedded in
                # the file; only load pickles this project wrote itself.
                with open(file_path, 'rb') as f:
                    df = pickle.load(f)
                print(f"成功从pickle加载文件: {selected_file}")
            else:
                # No pickle available: fall back to the original file.
                if selected_file.endswith(('.xlsx', '.xls')):
                    df = pd.read_excel(file_path)
                elif selected_file.endswith('.csv'):
                    df = pd.read_csv(file_path)
                else:
                    print(f"不支持的文件格式: {selected_file}")
                    continue
                print(f"成功从原始文件加载: {selected_file}")

            # Key the result by the requested filename, not the on-disk name.
            dataframes[filename] = df
            print(f"成功加载DataFrame: {filename}, 形状: {df.shape}")
        except Exception as e:
            # Best-effort loader: one unreadable file must not prevent the
            # remaining targets from loading.
            print(f"加载文件失败 {selected_file}: {e}")
            continue

    return dataframes
|
||
|
||
if __name__ == '__main__':
    # Guard against a loader result that is unusable (None or empty) so the
    # script reports the problem instead of failing on ``.keys()``.
    dataframes = test_load_selected_dataframes() or {}
    df_names = list(dataframes)

    if len(df_names) < 2:
        print(f"需要至少两个DataFrame才能执行数据处理, 实际加载: {len(df_names)}")
    else:
        df_1 = dataframes[df_names[0]]
        df_2 = dataframes[df_names[1]]

        # Dump a short summary of each input before processing.
        for position, (name, frame) in enumerate(
            ((df_names[0], df_1), (df_names[1], df_2)), start=1
        ):
            print(f"DataFrame {position} ({name}) 形状: {frame.shape}")
            print(f"DataFrame {position} 列名: {list(frame.columns)}")
            print(f"DataFrame {position} 前几行:")
            print(frame.head())
            print()

        # Run the user-supplied processing logic on the two frames.
        print("执行数据处理逻辑...")
        result = execute(df_1, df_2)

        print("处理结果:")
        print(f"结果形状: {result.shape}")
        print(f"结果列名: {list(result.columns)}")
        print("结果数据:")
        print(result)
|