1. 数据清洗与基本统计分析¶
(1) 数据导入和预览¶
In [ ]:
import pandas as pd
# 导入数据
df = pd.read_csv('transactions.csv')
# 预览数据
print(df.head())
transaction_id date amount description account_id 0 1 2022-01-01 761.59 Subscription Service 104 1 2 2022-01-02 1902.41 Office Supplies 103 2 3 2022-01-03 1469.35 Travel Expenses 106 3 4 2022-01-04 1205.34 Travel Expenses 108 4 5 2022-01-05 328.92 Marketing Campaign 105
(2)数据清洗¶
In [ ]:
# 检查缺失值
print(df.isnull().sum())
# 填充缺失值或删除
df['description'].fillna('Unknown', inplace=True)
# 转换日期列为datetime类型
df['date'] = pd.to_datetime(df['date'])
transaction_id 0 date 0 amount 0 description 0 account_id 0 dtype: int64
(3) 基本统计分析¶
In [ ]:
print(df['amount'].describe())
count 100.000000 mean 950.957800 std 589.028528 min 30.930000 25% 402.540000 50% 939.005000 75% 1465.802500 max 1974.040000 Name: amount, dtype: float64
(4)Pandas中的类SQL操作¶
In [ ]:
# 选择特定列
selected_columns = df[['transaction_id', 'amount', 'account_id']]
print(selected_columns.head())
transaction_id amount account_id 0 1 761.59 104 1 2 1902.41 103 2 3 1469.35 106 3 4 1205.34 108 4 5 328.92 105
In [ ]:
# 过滤出金额大于1000的交易
filtered_df = df[df['amount'] > 1000]
print(filtered_df.head())
transaction_id date amount description account_id 1 2 2022-01-02 1902.41 Office Supplies 103 2 3 2022-01-03 1469.35 Travel Expenses 106 3 4 2022-01-04 1205.34 Travel Expenses 108 7 8 2022-01-08 1735.03 Stationery 104 8 9 2022-01-09 1210.21 Travel Expenses 103
In [ ]:
# 过滤出金额大于1000且摘要为'Travel Expenses'的交易
filtered_df = df[(df['amount'] > 1500) & (df['description'] == 'Travel Expenses')]
print(filtered_df.head())
# 过滤出金额大于1000或摘要为'Travel Expenses'的交易
filtered_df = df[(df['amount'] > 1500) | (df['description'] == 'Travel Expenses')]
print(filtered_df.head())
transaction_id date amount description account_id 35 36 2022-02-05 1620.63 Travel Expenses 102 53 54 2022-02-23 1791.76 Travel Expenses 108 transaction_id date amount description account_id 1 2 2022-01-02 1902.41 Office Supplies 103 2 3 2022-01-03 1469.35 Travel Expenses 106 3 4 2022-01-04 1205.34 Travel Expenses 108 7 8 2022-01-08 1735.03 Stationery 104 8 9 2022-01-09 1210.21 Travel Expenses 103
In [ ]:
# 按交易金额降序排序
sorted_df = df.sort_values(by='amount', ascending=False)
print(sorted_df.head())
transaction_id date amount description account_id 69 70 2022-03-11 1974.04 Software Purchase 104 11 12 2022-01-12 1940.42 Equipment Maintenance 102 50 51 2022-02-20 1939.78 Conference Fees 104 34 35 2022-02-04 1931.95 Equipment Purchase 106 1 2 2022-01-02 1902.41 Office Supplies 103
In [ ]:
# 按账户ID计算每ID的平均交易金额
grouped_df = df.groupby('account_id')['amount'].mean().reset_index()
print(grouped_df)
account_id amount 0 101 749.699412 1 102 1015.118889 2 103 952.278750 3 104 1171.691765 4 105 1106.192000 5 106 996.977000 6 107 907.305714 7 108 853.038421
In [ ]:
# 导入账户数据
accounts = pd.read_csv('accounts.csv')
# 合并交易数据和账户数据
merged_df = df.merge(accounts, on='account_id', how='left')
print(merged_df.head())
transaction_id date amount description account_id \
0 1 2022-01-01 761.59 Subscription Service 104
1 2 2022-01-02 1902.41 Office Supplies 103
2 3 2022-01-03 1469.35 Travel Expenses 106
3 4 2022-01-04 1205.34 Travel Expenses 108
4 5 2022-01-05 328.92 Marketing Campaign 105
account_name account_type
0 Travel Expense
1 Maintenance Service
2 Subscriptions Service
3 Renovations Project
4 Conferences Event
In [ ]:
# 创建透视表,按账户类型和交易日期计算总金额
pivot_table = pd.pivot_table(df, values='amount', index='date', columns='account_id', aggfunc='sum')
print(pivot_table.head())
account_id 101 102 103 104 105 106 107 108 date 2022-01-01 NaN NaN NaN 761.59 NaN NaN NaN NaN 2022-01-02 NaN NaN 1902.41 NaN NaN NaN NaN NaN 2022-01-03 NaN NaN NaN NaN NaN 1469.35 NaN NaN 2022-01-04 NaN NaN NaN NaN NaN NaN NaN 1205.34 2022-01-05 NaN NaN NaN NaN 328.92 NaN NaN NaN
In [ ]:
# 删除重复的交易记录
unique_df = df.drop_duplicates()
print(unique_df.head())
transaction_id date amount description account_id 0 1 2022-01-01 761.59 Subscription Service 104 1 2 2022-01-02 1902.41 Office Supplies 103 2 3 2022-01-03 1469.35 Travel Expenses 106 3 4 2022-01-04 1205.34 Travel Expenses 108 4 5 2022-01-05 328.92 Marketing Campaign 105
In [ ]:
# 添加一列,表示交易金额是否大于1000
df['amount_gt_1000'] = df['amount'] > 1000
print(df.head())
transaction_id date amount description account_id \ 0 1 2022-01-01 761.59 Subscription Service 104 1 2 2022-01-02 1902.41 Office Supplies 103 2 3 2022-01-03 1469.35 Travel Expenses 106 3 4 2022-01-04 1205.34 Travel Expenses 108 4 5 2022-01-05 328.92 Marketing Campaign 105 amount_gt_1000 0 False 1 True 2 True 3 True 4 False
2.财务报表分析与可视化¶
In [ ]:
# 导入账户数据
accounts = pd.read_csv('accounts.csv')
# 合并交易数据和账户数据
df = df.merge(accounts, on='account_id', how='left')
# 按账户类型聚合数据并可视化
import matplotlib.pyplot as plt
# 按账户类型计算总金额
grouped = df.groupby('account_type')['amount'].sum().reset_index()
# 可视化
plt.figure(figsize=(10, 6))
plt.bar(grouped['account_type'], grouped['amount'])
plt.xlabel('Account Type')
plt.ylabel('Total Amount')
plt.title('Total Amount by Account Type')
plt.show()
df.merge(accounts, on='account_id', how='left'): 此行根据共同的 account_id 列合并 transactions 和 accounts DataFrame。how='left' 参数确保保留 transactions 中的所有记录,即使在 accounts 中没有匹配的记录。¶
df.groupby('account_type')['amount'].sum().reset_index(): 此行将 transactions DataFrame 按 account_type 分组,计算每个组的总金额 (sum()),然后重置索引以创建新的 DataFrame 名为 grouped。¶
plt.figure(figsize=(10, 6)): 此行创建用于可视化的新图形,并指定宽度和高度。¶
plt.bar(grouped['account_type'], grouped['amount']): 此行创建条形图,其中 x 轴代表 account_type,y 轴代表相应的总金额。¶
plt.xlabel('Account Type'), plt.ylabel('Total Amount'), plt.title('Total Amount by Account Type'): 这些行设置图表标签和标题。¶
plt.show(): 此行显示生成的可视化。¶
3. 现金流量表分析实践¶
(1) 生成现金流量表¶
导入数据并预处理¶
In [ ]:
import pandas as pd
# 导入数据
transactions = pd.read_csv('transactions.csv')
accounts = pd.read_csv('accounts.csv')
# 转换日期列为datetime类型
transactions['date'] = pd.to_datetime(transactions['date'])
# 合并交易数据和账户数据
df = transactions.merge(accounts, on='account_id', how='left')
分类现金流量:将交易分类为运营活动、投资活动和筹资活动¶
In [ ]:
# 定义现金流类型
def classify_cash_flow(description):
if 'Supplies' in description or 'Expenses' in description or 'Fees' in description:
return 'Operating'
elif 'Purchase' in description or 'Renovation' in description or 'Maintenance' in description:
return 'Investing'
elif 'Campaign' in description or 'Conference' in description or 'Subscription' in description:
return 'Financing'
else:
return 'Operating'
# 应用分类
df['cash_flow_type'] = df['description'].apply(classify_cash_flow)
按月计算各类现金流量和净现金流量¶
In [ ]:
# 添加月份列
df['month'] = df['date'].dt.to_period('M')
# 按月和现金流类型聚合数据
monthly_cash_flow = df.groupby(['month', 'cash_flow_type'])['amount'].sum().fillna(0)
print(monthly_cash_flow)
monthly_cash_flow = monthly_cash_flow.unstack()
print(monthly_cash_flow)
# 计算净现金流量
monthly_cash_flow['Net Cash Flow'] = monthly_cash_flow.sum(axis=1)
print(monthly_cash_flow)
month cash_flow_type
2022-01 Financing 4562.71
Investing 6093.60
Operating 17219.31
2022-02 Financing 3195.23
Investing 7049.78
Operating 17954.31
2022-03 Financing 5503.84
Investing 7020.43
Operating 17380.13
2022-04 Financing 1526.35
Investing 1780.15
Operating 5809.94
Name: amount, dtype: float64
cash_flow_type Financing Investing Operating
month
2022-01 4562.71 6093.60 17219.31
2022-02 3195.23 7049.78 17954.31
2022-03 5503.84 7020.43 17380.13
2022-04 1526.35 1780.15 5809.94
cash_flow_type Financing Investing Operating Net Cash Flow
month
2022-01 4562.71 6093.60 17219.31 27875.62
2022-02 3195.23 7049.78 17954.31 28199.32
2022-03 5503.84 7020.43 17380.13 29904.40
2022-04 1526.35 1780.15 5809.94 9116.44
(2)可视化各类现金流量和净现金流量¶
In [ ]:
import matplotlib.pyplot as plt
# 设置图形尺寸
plt.figure(figsize=(12, 8))
# 绘制各类现金流量图
for column in monthly_cash_flow.columns[:-1]:
plt.plot(monthly_cash_flow.index.to_timestamp(), monthly_cash_flow[column], marker='o', label=column)
# 绘制净现金流量图
plt.plot(monthly_cash_flow.index.to_timestamp(), monthly_cash_flow['Net Cash Flow'], marker='o', linestyle='--', color='black', label='Net Cash Flow')
# 添加图例和标签
plt.xlabel('Month')
plt.ylabel('Cash Flow Amount')
plt.title('Monthly Cash Flow Analysis')
plt.legend()
plt.grid(True)
plt.show()
(3)趋势分析:现金流量滚动平均值¶
In [ ]:
# 计算3个月滚动平均值
rolling_window = 3
for column in monthly_cash_flow.columns:
monthly_cash_flow[f'{column} (Rolling Avg)'] = monthly_cash_flow[column].rolling(window=rolling_window).mean()
print(monthly_cash_flow)
cash_flow_type Financing Investing Operating Net Cash Flow \ month 2022-01 4562.71 6093.60 17219.31 27875.62 2022-02 3195.23 7049.78 17954.31 28199.32 2022-03 5503.84 7020.43 17380.13 29904.40 2022-04 1526.35 1780.15 5809.94 9116.44 cash_flow_type Financing (Rolling Avg) Investing (Rolling Avg) \ month 2022-01 NaN NaN 2022-02 NaN NaN 2022-03 4420.593333 6721.270000 2022-04 3408.473333 5283.453333 cash_flow_type Operating (Rolling Avg) Net Cash Flow (Rolling Avg) month 2022-01 NaN NaN 2022-02 NaN NaN 2022-03 17517.916667 28659.78 2022-04 13714.793333 22406.72
In [ ]:
plt.figure(figsize=(12, 8))
# 绘制滚动平均值图
for column in monthly_cash_flow.columns[len(monthly_cash_flow.columns)//2:]:
plt.plot(monthly_cash_flow.index.to_timestamp(), monthly_cash_flow[column], marker='o', linestyle='--', label=column)
# 添加图例和标签
plt.xlabel('Month')
plt.ylabel('Cash Flow Amount (Rolling Avg)')
plt.title('Monthly Cash Flow Analysis with Rolling Average')
plt.legend()
plt.grid(True)
plt.show()
(4)异常检测:识别和解释¶
In [ ]:
# 定义异常标准:净现金流量超出滚动平均值的threshold倍标准差
threshold = 1.15
monthly_cash_flow['Deviation'] = monthly_cash_flow['Net Cash Flow'] - monthly_cash_flow['Net Cash Flow (Rolling Avg)']
monthly_cash_flow['Std Dev'] = monthly_cash_flow['Net Cash Flow'].rolling(window=rolling_window).std()
monthly_cash_flow['Anomaly'] = monthly_cash_flow['Deviation'].abs() > (threshold * monthly_cash_flow['Std Dev'])
# 标记异常月份
anomalies = monthly_cash_flow[monthly_cash_flow['Anomaly']]
print(anomalies)
cash_flow_type Financing Investing Operating Net Cash Flow \ month 2022-04 1526.35 1780.15 5809.94 9116.44 cash_flow_type Financing (Rolling Avg) Investing (Rolling Avg) \ month 2022-04 3408.473333 5283.453333 cash_flow_type Operating (Rolling Avg) Net Cash Flow (Rolling Avg) \ month 2022-04 13714.793333 22406.72 cash_flow_type Deviation Std Dev Anomaly month 2022-04 -13290.28 11541.251289 True
In [ ]:
plt.figure(figsize=(12, 8))
# 绘制净现金流量和异常标记
plt.plot(monthly_cash_flow.index.to_timestamp(), monthly_cash_flow['Net Cash Flow'], marker='o', label='Net Cash Flow')
plt.scatter(anomalies.index.to_timestamp(), anomalies['Net Cash Flow'], color='red', label='Anomaly', zorder=5)
# 添加图例和标签
plt.xlabel('Month')
plt.ylabel('Net Cash Flow Amount')
plt.title('Net Cash Flow with Anomaly Detection')
plt.legend()
plt.grid(True)
plt.show()
4. Pandas中的透视表¶
In [ ]:
# 创建数据透视表,按账户类型和月份计算总金额
pivot_table = pd.pivot_table(df, values='amount', index='account_type', columns='month', aggfunc='sum', fill_value=0)
print(pivot_table)
month 2022-01 2022-02 2022-03 2022-04 account_type Department 2863.44 2877.10 3395.53 0.00 Event 328.92 3434.97 635.74 1131.33 Expense 12328.19 12846.56 9303.93 4536.11 Project 4297.04 4786.20 5598.14 1526.35 Service 8058.03 4254.49 10971.06 1922.65
pd.pivot_table 方法:¶
pd.pivot_table 是 pandas 中用于创建透视表的方法。¶
参数解释:¶
df:这是包含数据的 DataFrame。¶
values='amount':指定要聚合的值列,即 amount 列中的数据。¶
index='account_type':指定行索引,按 account_type 列进行分组。¶
columns='month':指定列索引,按 month 列进行分组。¶
aggfunc='sum':指定聚合函数,这里使用 sum 对每个分组的 amount 值进行求和。¶
fill_value=0:指定填充值,如果某个单元格中没有数据,用 0 填充。¶
In [ ]:
# 假设需要计算每个 account_type 在每个月的平均金额
pivot_table_mean = pd.pivot_table(
df,
values='amount',
index='account_type',
columns='month',
aggfunc='mean',
fill_value=0
)
print(pivot_table_mean)
month 2022-01 2022-02 2022-03 2022-04 account_type Department 1431.720000 719.275000 1131.843333 0.000000 Event 328.920000 1717.485000 635.740000 1131.330000 Expense 821.879333 988.196923 1162.991250 907.222000 Project 859.408000 1196.550000 622.015556 1526.350000 Service 1007.253750 850.898000 1097.106000 640.883333
In [ ]:
# 热力图可视化数据透视表
import seaborn as sns
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="YlGnBu")
plt.title('Total Amount by Account Type and Month')
plt.show()
练习¶
1: 数据清洗与基本统计分析¶
背景:你是一名数据分析师,负责清洗和分析公司财务交易数据。
任务:
导入transactions.csv数据,检查并处理缺失值。
计算并显示交易金额的基本统计量(如均值、中位数、标准差)。
In [ ]:
import pandas as pd
# 导入数据
df = pd.read_csv('transactions.csv')
# 检查缺失值
print(df.isnull().sum())
# 填充缺失值
df['description'].fillna('Unknown', inplace=True)
df['amount'].fillna(df['amount'].mean(), inplace=True)
transaction_id 0 date 0 amount 0 description 0 account_id 0 dtype: int64
C:\Users\shen\AppData\Local\Temp\ipykernel_17288\802481515.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['description'].fillna('Unknown', inplace=True)
C:\Users\shen\AppData\Local\Temp\ipykernel_17288\802481515.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['amount'].fillna(df['amount'].mean(), inplace=True)
In [ ]:
# 计算交易金额的基本统计量
amount_stats = df['amount'].describe()
print(amount_stats)
count 100.000000 mean 950.957800 std 589.028528 min 30.930000 25% 402.540000 50% 939.005000 75% 1465.802500 max 1974.040000 Name: amount, dtype: float64
2:财务报表分析与可视化¶
背景:你需要对公司不同类型的财务交易进行分析,并生成可视化报表。
任务:
合并transactions.csv和accounts.csv数据,显示交易ID、日期、金额、描述和账户名称。
按账户类型计算每个月的总交易金额,并进行可视化。
In [ ]:
# 导入数据
transactions = pd.read_csv('transactions.csv')
accounts = pd.read_csv('accounts.csv')
# 合并数据
df = transactions.merge(accounts, on='account_id', how='left')
# 显示相关字段
print(df[['transaction_id', 'date', 'amount', 'description', 'account_name']].head())
transaction_id date amount description account_name 0 1 2022-01-01 761.59 Subscription Service Travel 1 2 2022-01-02 1902.41 Office Supplies Maintenance 2 3 2022-01-03 1469.35 Travel Expenses Subscriptions 3 4 2022-01-04 1205.34 Travel Expenses Renovations 4 5 2022-01-05 328.92 Marketing Campaign Conferences
In [ ]:
import matplotlib.pyplot as plt
# 将日期列转换为datetime类型
df['date'] = pd.to_datetime(df['date'])
# 添加月份列
df['month'] = df['date'].dt.to_period('M')
# 按月和账户类型聚合数据
monthly_summary = df.groupby(['month', 'account_type'])['amount'].sum().unstack().fillna(0)
# 可视化
monthly_summary.plot(kind='bar', stacked=True, figsize=(12, 8))
plt.xlabel('Month')
plt.ylabel('Total Amount')
plt.title('Monthly Total Amount by Account Type')
plt.legend(title='Account Type')
plt.show()
In [ ]:
import pandas as pd
# 导入数据
transactions = pd.read_csv('transactions.csv')
# 将日期列转换为datetime类型
transactions['date'] = pd.to_datetime(transactions['date'])
# 添加月份列
transactions['month'] = transactions['date'].dt.to_period('M')
# 按月聚合数据,计算总现金流量
monthly_cash_flow = transactions.groupby('month')['amount'].sum().reset_index()
monthly_cash_flow['month'] = monthly_cash_flow['month'].dt.to_timestamp()
print(monthly_cash_flow)
month amount 0 2022-01-01 27875.62 1 2022-02-01 28199.32 2 2022-03-01 29904.40 3 2022-04-01 9116.44
In [ ]:
import matplotlib.pyplot as plt
# 计算三个月滚动平均值
monthly_cash_flow['moving_avg'] = monthly_cash_flow['amount'].rolling(window=3).mean()
# 可视化
plt.figure(figsize=(12, 8))
plt.plot(monthly_cash_flow['month'], monthly_cash_flow['amount'], marker='o', label='Total Cash Flow')
plt.plot(monthly_cash_flow['month'], monthly_cash_flow['moving_avg'], marker='o', linestyle='--', label='3-Month Moving Average')
plt.xlabel('Month')
plt.ylabel('Total Cash Flow')
plt.title('Monthly Cash Flow with 3-Month Moving Average')
plt.legend()
plt.grid(True)
plt.show()