126 lines
4.6 KiB
Python
126 lines
4.6 KiB
Python
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import asyncio
|
|||
|
|
import pandas as pd
|
|||
|
|
import tempfile
|
|||
|
|
import pickle
|
|||
|
|
from datetime import datetime
|
|||
|
|
from typing import Dict, Any, List
|
|||
|
|
|
|||
|
|
sys.path.insert(0,os.path.join(os.path.dirname(__file__),'..','','backend'))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def execute(df_1,df_2):
|
|||
|
|
# 假设合同日期列是字符串类型,将其转换为日期类型
|
|||
|
|
if '合同日期' in df_1.columns:
|
|||
|
|
df_1['合同日期'] = pd.to_datetime(df_1['合同日期'])
|
|||
|
|
if '合同日期' in df_2.columns:
|
|||
|
|
df_2['合同日期'] = pd.to_datetime(df_2['合同日期'])
|
|||
|
|
|
|||
|
|
# 筛选出2024年和2025年的数据
|
|||
|
|
filtered_df_1 = df_1[
|
|||
|
|
(df_1['合同日期'].dt.year == 2024) | (df_1['合同日期'].dt.year == 2025)]
|
|||
|
|
filtered_df_2 = df_2[
|
|||
|
|
(df_2['合同日期'].dt.year == 2024) | (df_2['合同日期'].dt.year == 2025)]
|
|||
|
|
# 合并两个数据框
|
|||
|
|
combined_df = pd.concat([filtered_df_1[:5], filtered_df_2[:7]], ignore_index=True)
|
|||
|
|
# 在去重前清理空值
|
|||
|
|
# combined_df_clean = combined_df.dropna(subset=['项目号']) # 确保主键不为空
|
|||
|
|
|
|||
|
|
# 填充数值列的空值
|
|||
|
|
combined_df_filled = combined_df.fillna({
|
|||
|
|
'总合同额': 0,
|
|||
|
|
'已确认比例': 0,
|
|||
|
|
'分包合同额': 0
|
|||
|
|
})
|
|||
|
|
# 找出不同的项目
|
|||
|
|
unique_projects = combined_df.drop_duplicates(subset=['项目号'])
|
|||
|
|
return unique_projects
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_load_selected_dataframes():
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
file1_path = '2025年在手合同数据.xlsx.pkl'
|
|||
|
|
file2_path = '2024年在手合同数据.xlsx.pkl'
|
|||
|
|
target_filenames = [file1_path,file2_path]
|
|||
|
|
dataframes = {}
|
|||
|
|
base_dir = os.path.join("D:\workspace-py\chat-agent\\backend","data","uploads","excel_6")
|
|||
|
|
|
|||
|
|
all_files = os.listdir(base_dir)
|
|||
|
|
for filename in target_filenames:
|
|||
|
|
matching_files = []
|
|||
|
|
for file in all_files:
|
|||
|
|
if file.endswith(f"_{filename}") or file.endswith(f"_{filename}.pkl"):
|
|||
|
|
matching_files.append(file)
|
|||
|
|
if not matching_files:
|
|||
|
|
print(f"未找到匹配的文件: {filename}")
|
|||
|
|
|
|||
|
|
# 如果有多个匹配文件,选择最新的
|
|||
|
|
if len(matching_files) > 1:
|
|||
|
|
matching_files.sort(key=lambda x: os.path.getmtime(os.path.join(base_dir, x)), reverse=True)
|
|||
|
|
print(f"找到多个匹配文件,选择最新的: {matching_files[0]}")
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
selected_file = matching_files[0]
|
|||
|
|
file_path = os.path.join(base_dir, selected_file)
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
# 优先加载pickle文件
|
|||
|
|
if selected_file.endswith('.pkl'):
|
|||
|
|
with open(file_path, 'rb') as f:
|
|||
|
|
df = pickle.load(f)
|
|||
|
|
print(f"成功从pickle加载文件: {selected_file}")
|
|||
|
|
else:
|
|||
|
|
# 如果没有pickle文件,尝试加载原始文件
|
|||
|
|
if selected_file.endswith(('.xlsx', '.xls')):
|
|||
|
|
df = pd.read_excel(file_path)
|
|||
|
|
elif selected_file.endswith('.csv'):
|
|||
|
|
df = pd.read_csv(file_path)
|
|||
|
|
else:
|
|||
|
|
print(f"不支持的文件格式: {selected_file}")
|
|||
|
|
continue
|
|||
|
|
print(f"成功从原始文件加载: {selected_file}")
|
|||
|
|
|
|||
|
|
# 使用原始文件名作为key
|
|||
|
|
dataframes[filename] = df
|
|||
|
|
print(f"成功加载DataFrame: {filename}, 形状: {df.shape}")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"加载文件失败 {selected_file}: {e}")
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
return dataframes
|
|||
|
|
except Exception as e:
|
|||
|
|
print(e)
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
|
|||
|
|
dataframes = test_load_selected_dataframes()
|
|||
|
|
df_names = list(dataframes.keys())
|
|||
|
|
if len(df_names) >= 2:
|
|||
|
|
df_1 = dataframes[df_names[0]]
|
|||
|
|
df_2 = dataframes[df_names[1]]
|
|||
|
|
|
|||
|
|
print(f"DataFrame 1 ({df_names[0]}) 形状: {df_1.shape}")
|
|||
|
|
print(f"DataFrame 1 列名: {list(df_1.columns)}")
|
|||
|
|
print(f"DataFrame 1 前几行:")
|
|||
|
|
print(df_1.head())
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
print(f"DataFrame 2 ({df_names[1]}) 形状: {df_2.shape}")
|
|||
|
|
print(f"DataFrame 2 列名: {list(df_2.columns)}")
|
|||
|
|
print(f"DataFrame 2 前几行:")
|
|||
|
|
print(df_2.head())
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
# 执行用户提供的数据处理逻辑
|
|||
|
|
print("执行数据处理逻辑...")
|
|||
|
|
result = execute(df_1, df_2)
|
|||
|
|
|
|||
|
|
print("处理结果:")
|
|||
|
|
print(f"结果形状: {result.shape}")
|
|||
|
|
print(f"结果列名: {list(result.columns)}")
|
|||
|
|
print("结果数据:")
|
|||
|
|
print(result)
|