Files
hyperspectral-hongshengrese…/Chla-time-plot.py

211 lines
8.2 KiB
Python
Raw Normal View History

2025-07-22 10:27:31 +08:00
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from datetime import datetime
# 设置叶绿素a阈值
CHLOROPHYLL_THRESHOLD = 100.0 # 大于此值的叶绿素a数据将被剔除
# 1. 读取CSV文件
try:
# 尝试自动检测文件格式
df = pd.read_csv(
r"D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\不去耀斑index.csv",
header=None, skip_blank_lines=True, on_bad_lines='warn')
# 检查列数,可能有多余列
if df.shape[1] > 2:
print(f"警告:检测到 {df.shape[1]}但预期只有2列。尝试提取前两列。")
df = df.iloc[:, :2] # 只保留前两列
# 重命名列
df.columns = ['Time', 'Chl_a']
# 打印前几行用于调试
print("原始数据预览:")
print(df.head())
# 检查是否有标题行
if not isinstance(df['Time'].iloc[0], str) or any(char.isalpha() for char in str(df['Time'].iloc[0])):
print("检测到可能的标题行,跳过第一行")
df = pd.read_csv(
r"D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\不去耀斑index.csv",
header=None, skiprows=1, skip_blank_lines=True)
df = df.iloc[:, :2] # 只保留前两列
df.columns = ['Time', 'Chl_a']
print("处理后数据预览:")
print(df.head())
except Exception as e:
print(f"读取文件时出错: {e}")
exit()
# 2. 转换时间格式
try:
# 尝试解析时间
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y/%m/%d %H:%M')
# 检查是否有解析失败的时间
if df['Time'].isna().any():
print("部分时间解析失败,尝试其他格式...")
# 尝试其他常见格式
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y-%m-%d %H:%M')
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y/%m/%d %H:%M:%S')
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
# 如果仍有失败,尝试自动推断
if df['Time'].isna().any():
print("使用混合格式解析时间")
df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='mixed')
# 删除无法解析时间的行
na_count = df['Time'].isna().sum()
if na_count > 0:
print(f"警告: 删除了 {na_count} 行无法解析的时间数据")
df = df.dropna(subset=['Time'])
# 提取日期和时间
df['Date'] = df['Time'].dt.date
df['Time_of_day'] = df['Time'].dt.time
except Exception as e:
print(f"时间解析错误: {e}")
exit()
# 3. 确保叶绿素a列是数值型并应用阈值过滤
try:
df['Chl_a'] = pd.to_numeric(df['Chl_a'], errors='coerce')
# 记录过滤前的数据量
original_count = len(df)
# 应用阈值过滤
above_threshold = df[df['Chl_a'] > CHLOROPHYLL_THRESHOLD]
if not above_threshold.empty:
print(f"警告: 发现 {len(above_threshold)} 行叶绿素a值超过阈值 {CHLOROPHYLL_THRESHOLD} μg/L将被剔除")
print(above_threshold)
df = df[df['Chl_a'] <= CHLOROPHYLL_THRESHOLD]
# 删除无效值
na_count = df['Chl_a'].isna().sum()
if na_count > 0:
print(f"警告: 删除了 {na_count} 行无效的叶绿素值")
df = df.dropna(subset=['Chl_a'])
# 报告过滤结果
filtered_count = len(df)
print(f"数据过滤结果: 原始 {original_count} 行 → 保留 {filtered_count}")
except Exception as e:
print(f"叶绿素值转换错误: {e}")
exit()
# 4. 按日期分组计算统计量
daily_stats = df.groupby('Date')['Chl_a'].agg(['mean', 'std', 'count']).reset_index()
daily_stats.columns = ['Date', 'Daily_Mean', 'Std_Dev', 'Sample_Count']
# 获取唯一日期
unique_dates = sorted(df['Date'].unique())
# 5. 创建绘图 - 箱型图
fig1, ax1 = plt.subplots(figsize=(14, 6))
# 绘制箱型图
ax1.boxplot([df[df['Date'] == date]['Chl_a'] for date in unique_dates], labels=[str(date) for date in unique_dates])
# 添加平均线
ax1.plot(range(1, len(unique_dates) + 1), daily_stats['Daily_Mean'], color='red', linestyle='--', label='Daily Mean')
ax1.set_title('Daily Chlorophyll-a Concentration with Box Plot', fontsize=14)
ax1.set_ylabel('Chl-a (μg/L)', fontsize=12)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
# 添加图例
ax1.legend()
plt.tight_layout()
plt.savefig(
r'D:\WQ\zhanghuilai\hyperspectral-inversion\data\input\一代高光谱\2025-07-11_2025-07-21\plot\daily_chlorophyll_boxplot.png',
dpi=300, bbox_inches='tight')
#
# # 6. 创建每日时间序列图 - 每个日期一个子图
# fig2, axs = plt.subplots(len(unique_dates), 1, figsize=(14, 4 * len(unique_dates)), sharex=True)
#
# if len(unique_dates) == 1:
# axs = [axs] # 确保单个子图时也能正确处理
#
# fig2.suptitle('Chlorophyll-a Concentration Variation Throughout the Day', fontsize=16, y=0.98)
#
# for i, date in enumerate(unique_dates):
# daily_data = df[df['Date'] == date].sort_values('Time')
# ax = axs[i]
#
# # 使用真实时间作横轴
# ax.plot(daily_data['Time'], daily_data['Chl_a'], marker='', linestyle='-', color='royalblue',
# label=f'Chl-a on {date}', alpha=0.8)
#
# # 添加统计信息
# daily_mean = daily_data['Chl_a'].mean()
# daily_std = daily_data['Chl_a'].std()
# ax.axhline(y=daily_mean, color='g', linestyle='--', alpha=0.7)
# ax.axhspan(daily_mean - daily_std, daily_mean + daily_std, color='lightgreen', alpha=0.3)
#
# ax.set_title(f"{date.strftime('%Y-%m-%d')} (n={len(daily_data)})", fontsize=12)
# ax.set_ylabel('Chl-a (μg/L)', fontsize=10)
# ax.grid(True, linestyle='--', alpha=0.7)
#
# ax.legend(loc='upper left')
#
# # 自动优化时间刻度显示
# fig2.autofmt_xdate(rotation=45)
#
# # 保存每个日期的图像,确保文件名唯一
# save_path = f'D:/WQ/zhanghuilai/hyperspectral-inversion/data/input/一代高光谱/2025-07-11_2025-07-21/plot/daily_chlorophyll_timeseries_{date.strftime("%Y-%m-%d")}.png'
# fig2.savefig(save_path, dpi=300, bbox_inches='tight')
# 6. 为每个日期单独创建图像
for date in unique_dates:
daily_data = df[df['Date'] == date].sort_values('Time')
fig, ax = plt.subplots(figsize=(14, 4))
# 使用真实时间作横轴
ax.plot(daily_data['Time'], daily_data['Chl_a'], marker='o', linestyle='-', color='royalblue',
label=f'Chl-a on {date}', alpha=0.8)
# 添加统计信息
daily_mean = daily_data['Chl_a'].mean()
daily_std = daily_data['Chl_a'].std()
ax.axhline(y=daily_mean, color='g', linestyle='--', alpha=0.7, label='Daily Mean')
ax.axhspan(daily_mean - daily_std, daily_mean + daily_std, color='lightgreen', alpha=0.3, label='±1 Std Dev')
ax.set_title(f"Chlorophyll-a Time Series on {date.strftime('%Y-%m-%d')} (n={len(daily_data)})", fontsize=13)
ax.set_xlabel("Time", fontsize=11)
ax.set_ylabel("Chl-a (μg/L)", fontsize=11)
ax.grid(True, linestyle='--', alpha=0.7)
# 设置时间格式优化显示
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
fig.autofmt_xdate(rotation=45)
ax.legend(loc='upper left')
# 保存图像
save_path = f'D:/WQ/zhanghuilai/hyperspectral-inversion/data/input/一代高光谱/2025-07-11_2025-07-21/plot/daily_chlorophyll_timeseries_{date.strftime("%Y-%m-%d")}.png'
fig.savefig(save_path, dpi=300, bbox_inches='tight')
plt.close(fig) # 防止内存过多占用
# 打印数据统计摘要
print("\n数据统计摘要:")
print(f"叶绿素a阈值: {CHLOROPHYLL_THRESHOLD} μg/L")
print(f"总样本数: {len(df)}")
print(f"监测天数: {len(unique_dates)}")
print(f"平均每日样本数: {len(df) / len(unique_dates):.1f}")
print(f"叶绿素a总体均值: {df['Chl_a'].mean():.2f} ± {df['Chl_a'].std():.2f} μg/L")
print(f"叶绿素a最小值: {df['Chl_a'].min():.2f} μg/L")
print(f"叶绿素a最大值: {df['Chl_a'].max():.2f} μg/L")
print("\n每日统计:")
print(daily_stats)