文章目录
饼图、垂直条形图、水平条形图、堆叠条形图、水平交错条形图、pandas模块之垂直或水平条形图、pandas模块之水平交错条形图、seaborn模块之垂直或水平条形图、seaborn模块之水平交错条形图、matplotlib模块之直方图、pandas模块之直方图和核密度图、seaborn模块之分组的直方图和核密度图、单个箱线图、分组箱线图、小提琴图、单条折线图、多条折线图、pandas模块之单组散点图、seaborn模块之分组散点图、气泡图、热力图、词云分词
饼图
import matplotlib.pyplot as plt

# Proportion of each education level, in the same order as `labels`.
edu = [0.2515, 0.3724, 0.3336, 0.0368, 0.0057]
labels = ['中专', '大专', '本科', '硕士', '其他']

# Select the SimHei font for the Chinese labels and disable the Unicode
# minus sign in axis text.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Basic pie chart with percentage annotations on every wedge.
plt.pie(edu, labels=labels, autopct='%.1f%%')
plt.title('失信用户的教育水平分布')
plt.show()
# Offset applied to each wedge; only the second entry is pulled out.
explode = [0, 0.1, 0, 0, 0]
colors = ['#9999ff', '#ff9999', '#7777aa', '#2442aa', '#dd5555']

plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Create the axes with an equal aspect ratio before drawing the pie.
plt.axes(aspect='equal')

# All styling options of the customised pie, gathered in one place.
pie_options = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%.1f%%',
    pctdistance=0.8,
    labeldistance=1.1,
    startangle=180,
    radius=1.2,
    counterclock=False,
    wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},
    textprops={'fontsize': 10, 'color': 'black'},
)
plt.pie(edu, **pie_options)
plt.title('失信用户的受教育水平分布')
plt.show()
import pandas as pd

# Same education shares as a labelled Series. The name is left empty;
# presumably so pandas does not print it as an axis label (verify).
data1 = pd.Series(
    {'中专': 0.2515, '大专': 0.3724, '本科': 0.3336, '硕士': 0.0368, '其他': 0.0057},
    name='',
)

plt.axes(aspect='equal')

# Draw the pie through the pandas plotting accessor.
data1.plot.pie(
    autopct='%.1f%%',
    radius=1,
    startangle=180,
    counterclock=False,
    title='失信用户的受教育水平分布',
    wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},
    textprops={'fontsize': 10, 'color': 'black'},
)
plt.show()
垂直条形图
# Load the province-level GDP workbook (columns used below: Province, GDP).
GDP = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Province GDP 2017.xlsx')
plt.style.use('ggplot')

# One bar per row, labelled with the province name.
bar_positions = range(len(GDP))
plt.bar(
    x=bar_positions,
    height=GDP['GDP'],
    tick_label=GDP['Province'],
    color='steelblue',
)
plt.ylabel('GDP(万亿)')
plt.title('2017年度6个省份GDP分布')

# Write each value, rounded to one decimal, slightly above its bar.
for position, value in enumerate(GDP['GDP']):
    plt.text(position, value + 0.1, str(round(value, 1)), ha='center')
plt.show()
水平条形图
# Sort in place so the bars are drawn in ascending GDP order.
GDP.sort_values(by='GDP', inplace=True)

row_positions = range(len(GDP))
plt.barh(
    y=row_positions,
    width=GDP['GDP'],
    tick_label=GDP['Province'],
    color='steelblue',
)
plt.xlabel('GDP(万亿)')
plt.title('2017年度6个省份GDP分布')

# Write each value, rounded to one decimal, just right of its bar.
for position, value in enumerate(GDP['GDP']):
    plt.text(value + 0.1, position, str(round(value, 1)), va='center')
plt.show()
堆叠条形图
# Load quarterly output per industry (columns used: Quarter, Industry_Type, GPD).
# NOTE(review): 'GPD' looks like a typo for 'GDP' but is kept as-is because it
# must match the column name in the workbook -- confirm against the data file.
Industry_GDP = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Industry_GDP.xlsx')
Quarters = Industry_GDP.Quarter.unique()
quarter_positions = range(len(Quarters))


def _industry_values(industry_type):
    """Return the GPD values of one industry type, re-indexed from 0.

    All three series get the same 0..n-1 index so that adding them
    (for the ``bottom`` of the stacked bars) aligns row by row.
    """
    selected = Industry_GDP.GPD[Industry_GDP.Industry_Type == industry_type]
    return selected.reset_index(drop=True)


Industry1 = _industry_values('第一产业')
Industry2 = _industry_values('第二产业')
# Previously this series kept its original row labels, unlike the other two.
Industry3 = _industry_values('第三产业')

# Stack the bars: each layer starts where the layers below it end.
plt.bar(x=quarter_positions, height=Industry1, color='steelblue', label='第一产业',
        tick_label=Quarters)
plt.bar(x=quarter_positions, height=Industry2, bottom=Industry1, color='green',
        label='第二产业')
plt.bar(x=quarter_positions, height=Industry3, bottom=Industry1 + Industry2,
        color='red', label='第三产业')
plt.ylabel('生成总值(亿)')
plt.title('2017年各季度三产业总值')
plt.legend()
plt.show()
水平交错条形图
import numpy as np

# Load the per-city, per-year counts (columns used: City, Year, Counts).
HuRun = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\HuRun.xlsx')
Cities = HuRun.City.unique()
Counts2016 = HuRun.Counts[HuRun.Year == 2016]
Counts2017 = HuRun.Counts[HuRun.Year == 2017]

bar_width = 0.4
city_positions = np.arange(len(Cities))

# Two bars per city: 2016 at the base position, 2017 shifted by one bar width.
plt.bar(x=city_positions, height=Counts2016, label='2016', color='steelblue',
        width=bar_width)
plt.bar(x=city_positions + bar_width, height=Counts2017, label='2017',
        color='indianred', width=bar_width)

# Centre each tick between its two bars. Derived from the data and the bar
# width rather than the former hard-coded `np.arange(5) + 0.2`, so the labels
# stay correct if the number of cities or the bar width changes.
plt.xticks(city_positions + bar_width / 2, Cities)
plt.ylabel('亿万资产家庭数')
plt.title('近两年5个城市亿万资产家庭数比较')
plt.legend()
plt.show()
pandas模块之垂直或水平条形图
# Bar chart through the pandas plotting accessor.
gdp_values = GDP['GDP']
gdp_values.plot.bar(width=0.8, rot=0, color='steelblue', title='2017年度6个省份GDP分布')
plt.ylabel('GDP(万亿)')

# Replace the default tick labels with the province names.
province_names = GDP['Province']
plt.xticks(range(len(province_names)), province_names)

# Annotate every bar with its value rounded to one decimal.
for position, value in enumerate(gdp_values):
    plt.text(position - 0.1, value + 0.2, str(round(value, 1)), va='center')
plt.show()
pandas模块之水平交错条形图
# Reshape to one row per city with one column per year, ordered by the
# 2016 counts from largest to smallest.
HuRun_reshape = (
    HuRun.pivot_table(index='City', columns='Year', values='Counts')
    .reset_index()
    .sort_values(by=2016, ascending=False)
)

# Side-by-side bars for the two year columns.
HuRun_reshape.plot.bar(
    x='City',
    y=[2016, 2017],
    color=['steelblue', 'indianred'],
    rot=0,
    width=0.8,
    title='近两年5个城市亿万资产家庭数比较',
)
plt.ylabel('亿万资产家庭数')
plt.xlabel('')
plt.show()
seaborn模块之垂直或水平条形图
import seaborn as sns

# Horizontal bar chart: provinces on the y axis, GDP on the x axis.
gdp_bar_options = dict(
    y='Province',
    x='GDP',
    data=GDP,
    color='steelblue',
    orient='horizontal',
)
sns.barplot(**gdp_bar_options)
plt.xlabel('GDP(万亿)')
plt.ylabel('')
plt.title('2017年度6个省份GDP分布')

# Put each rounded value at the end of its bar.
for position, value in enumerate(GDP['GDP']):
    plt.text(value, position, str(round(value, 1)), va='center')
plt.show()
seaborn模块之水平交错条形图
import pandas as pd

# Load the Titanic training set (columns used here: Pclass, Age, Sex).
Titanic = pd.read_csv(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\titanic_train.csv')

# Grouped bars: Age per passenger class, split by sex, with styled error bars.
age_bar_options = dict(
    x='Pclass',
    y='Age',
    hue='Sex',
    data=Titanic,
    palette='RdBu',
    errcolor='blue',
    errwidth=2,
    saturation=1,
    capsize=0.05,
)
sns.barplot(**age_bar_options)
plt.title('各船舱等级中男女乘客的年龄差异')
plt.show()
matplotlib模块之直方图
# Drop rows without an age before plotting. The missing-value check is now a
# vectorised `.any()` whose result is actually used; previously
# `any(Titanic.Age.isnull())` was evaluated and its result thrown away.
if Titanic.Age.isnull().any():
    Titanic.dropna(subset=['Age'], inplace=True)

# Histogram of passenger ages in 20 bins.
plt.hist(
    x=Titanic.Age,
    bins=20,
    color='steelblue',
    edgecolor='black',
)
plt.xlabel('年龄')
plt.ylabel('频数')
plt.title('乘客年龄分布')
plt.show()
pandas模块之直方图和核密度图
# Density-normalised histogram with a kernel density curve drawn on top.
passenger_ages = Titanic['Age']
passenger_ages.plot.hist(bins=20, color='steelblue', edgecolor='black', density=True,
                         label='直方图')
passenger_ages.plot.kde(color='red', label='核密度图')
plt.xlabel('年龄')
plt.ylabel('核密度值')
plt.title('乘客年龄分布')
plt.legend()
plt.show()
seaborn模块之分组的直方图和核密度图
# Split the ages by sex.
Age_Male = Titanic.Age[Titanic.Sex == 'male']
Age_Female = Titanic.Age[Titanic.Sex == 'female']

# First figure: overlaid histograms (no density curve), one colour per sex.
for ages, bar_color, legend_label in (
    (Age_Male, 'steelblue', '男性'),
    (Age_Female, 'purple', '女性'),
):
    sns.distplot(ages, bins=20, kde=False, hist_kws={'color': bar_color},
                 label=legend_label)
plt.title('男女乘客的年龄直方图')
plt.legend()
plt.show()

# Second figure: density curves only, one line style per sex.
for ages, curve_style, legend_label in (
    (Age_Male, {'color': 'red', 'linestyle': '-'}, '男性'),
    (Age_Female, {'color': 'black', 'linestyle': '--'}, '女性'),
):
    sns.distplot(ages, hist=False, kde_kws=curve_style, norm_hist=True,
                 label=legend_label)
plt.title('男女乘客的年龄核密度图')
plt.legend()
plt.show()
单个箱线图
# Load the second-hand housing data (columns used in this file: price_unit, region).
Sec_Buildings = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\sec_buildings.xlsx')

# Appearance of the individual box-plot elements.
box_style = dict(
    patch_artist=True,
    showmeans=True,
    boxprops={'color': 'black', 'facecolor': 'steelblue'},
    flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 3},
    meanprops={'marker': 'D', 'markerfacecolor': 'indianred', 'markersize': 4},
    medianprops={'linestyle': '--', 'color': 'orange'},
    labels=[''],
)
plt.boxplot(x=Sec_Buildings['price_unit'], **box_style)
plt.title('二手房单价分布的箱线图')
plt.show()
分组箱线图
# Mean unit price per region, highest first; its index fixes the box order.
# The aggregation is named by the string 'mean' instead of passing np.mean:
# same result, and it avoids the FutureWarning recent pandas versions emit
# for NumPy callables in groupby aggregations.
group_region = Sec_Buildings.groupby('region')
avg_price = group_region.aggregate({'price_unit': 'mean'}).sort_values(
    'price_unit', ascending=False)

# One Series of unit prices per region, in the same order as avg_price.index.
region_price = [
    Sec_Buildings.price_unit[Sec_Buildings.region == region]
    for region in avg_price.index
]

plt.boxplot(
    x=region_price,
    patch_artist=True,
    labels=avg_price.index,
    showmeans=True,
    boxprops={'color': 'black', 'facecolor': 'steelblue'},
    flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 3},
    meanprops={'marker': 'D', 'markerfacecolor': 'indianred', 'markersize': 4},
    medianprops={'linestyle': '--', 'color': 'orange'},
)
plt.ylabel('单价(元)')
plt.title('不同行政区域的二手房单价对比')
plt.show()
# Marker and line styling shared by the box elements.
region_box_style = dict(
    flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 3},
    meanprops={'marker': 'D', 'markerfacecolor': 'indianred', 'markersize': 4},
    medianprops={'linestyle': '--', 'color': 'orange'},
)

# Seaborn version of the grouped box plot; boxes follow the order of
# avg_price.index.
sns.boxplot(
    x='region',
    y='price_unit',
    data=Sec_Buildings,
    order=avg_price.index,
    showmeans=True,
    color='steelblue',
    **region_box_style,
)
plt.xlabel('')
plt.ylabel('单价(元)')
plt.title('不同行政区域的二手房单价对比')
plt.show()
小提琴图
# Load the tipping data (columns used: total_bill, day, sex).
tips = pd.read_csv(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\tips.csv')

# Split violins: one half per sex for every day, days in the given order.
violin_options = dict(
    x="total_bill",
    y="day",
    hue="sex",
    data=tips,
    order=['Thur', 'Fri', 'Sat', 'Sun'],
    scale='count',
    split=True,
    palette='RdBu',
)
sns.violinplot(**violin_options)
plt.title('每天不同性别客户的消费额情况')
plt.legend(loc='upper center', ncol=2)
plt.show()
单条折线图
# Load the article-reading data (columns used in this file: Date, Counts, Times).
wechat = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\wechat.xlsx')

# Line and marker appearance of the single series.
line_style = dict(
    linestyle='-',
    linewidth=2,
    color='steelblue',
    marker='o',
    markersize=6,
    markeredgecolor='black',
    markerfacecolor='brown',
)
plt.plot(wechat['Date'], wechat['Counts'], **line_style)
plt.ylabel('人数')
plt.title('每天微信文章阅读人数趋势')
plt.show()
多条折线图
import matplotlib as mpl

# Draw both series against the same date axis.
for column, dash, line_color, legend_label in (
    ('Counts', '-', 'steelblue', '阅读人数'),
    ('Times', '--', 'indianred', '阅读人次'),
):
    plt.plot(wechat.Date, wechat[column], linestyle=dash, color=line_color,
             label=legend_label)

# Format the x ticks as month-day and place a major tick every 7 units.
ax = plt.gca()
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter("%m-%d"))
ax.xaxis.set_major_locator(mpl.ticker.MultipleLocator(7))

# Rotate the tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.ylabel('人数')
plt.title('每天微信文章阅读人数与人次趋势')
plt.legend()
plt.show()
# Load the weather data (columns used: month, year, high).
weather = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\weather.xlsx')

# Rows are months and columns are years; pivot_table aggregates `high` with
# its default aggregation function.
data = weather.pivot_table(index='month', columns='year', values='high')

# One line per year column, each with its own line style.
data.plot.line(style=['-', '--', ':'])
plt.xlabel('月份')
plt.ylabel('气温')
plt.title('每月平均最高气温波动趋势')
plt.show()
pandas模块之单组散点图
# Load the iris data (columns used in this file: Petal_Width, Petal_Length, Species).
iris = pd.read_csv(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\iris.csv')

# Scatter of petal length against petal width with matplotlib.
plt.scatter(x=iris['Petal_Width'], y=iris['Petal_Length'], color='steelblue')
plt.xlabel('花瓣宽度')
plt.ylabel('花瓣长度')
plt.title('鸢尾花的花瓣宽度与长度关系')
plt.show()
# The same scatter plot through the pandas plotting accessor.
iris.plot.scatter(x='Petal_Width', y='Petal_Length', title='鸢尾花的花瓣宽度与长度关系')
plt.xlabel('花瓣宽度')
plt.ylabel('花瓣长度')
plt.show()
seaborn模块之分组散点图
# Scatter coloured by species, drawn with seaborn's lmplot and the legend
# kept inside the plot area.
lmplot_options = dict(
    x='Petal_Width',
    y='Petal_Length',
    hue='Species',
    data=iris,
    legend_out=False,
    truncate=True,
)
sns.lmplot(**lmplot_options)
plt.xlabel('花瓣宽度')
plt.ylabel('花瓣长度')
plt.title('鸢尾花的花瓣宽度与长度关系')
plt.show()
气泡图
# Load the product data (columns used: Sales, Profit, Profit_Ratio, Category).
Prod_Category = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\SuperMarket.xlsx')

# Min-max scale the profit ratio. The 0.001 offset means the row with the
# minimum ratio still gets a non-zero marker size below.
range_diff = Prod_Category.Profit_Ratio.max() - Prod_Category.Profit_Ratio.min()
Prod_Category['std_ratio'] = (
    (Prod_Category.Profit_Ratio - Prod_Category.Profit_Ratio.min()) / range_diff
    + 0.001
)

# One scatter layer per category; the marker area encodes the scaled ratio.
for category, bubble_color in (
    ('办公用品', 'steelblue'),
    ('技术产品', 'indianred'),
    ('家具产品', 'black'),
):
    in_category = Prod_Category.Category == category
    plt.scatter(
        x=Prod_Category.Sales[in_category],
        y=Prod_Category.Profit[in_category],
        s=Prod_Category.std_ratio[in_category] * 1000,
        color=bubble_color,
        label=category,
        alpha=0.6,
    )

plt.xlabel('销售额')
plt.ylabel('利润')
plt.title('销售额、利润及利润率的气泡图')
plt.legend()
plt.show()
热力图
# Load the sales records (columns used: Date, Sales) and derive year and
# month columns from the date.
Sales = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Sales.xlsx')
Sales['year'] = Sales.Date.dt.year
Sales['month'] = Sales.Date.dt.month

# Total sales per (month, year) cell. The aggregation is named by the string
# 'sum' instead of passing np.sum: same result, and it avoids the
# FutureWarning recent pandas versions emit for NumPy callables.
Summary = Sales.pivot_table(index='month', columns='year', values='Sales',
                            aggfunc='sum')

# Heat map with every cell annotated in scientific notation.
sns.heatmap(
    data=Summary,
    cmap='PuBuGn',
    linewidths=.1,
    annot=True,
    fmt='.1e',
)
plt.title('每年各月份销售总额热力图')
plt.show()
# Load the trade records (columns used: Date, Order_Class, Sales, Transport,
# Trans_Cost) and derive year and month columns from the date.
Prod_Trade = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Prod_Trade.xlsx')
Prod_Trade['year'] = Prod_Trade.Date.dt.year
Prod_Trade['month'] = Prod_Trade.Date.dt.month

# Build the row mask for 2012 once; three of the four panels use it.
is_2012 = Prod_Trade.year == 2012

# Four panels on a 2x3 grid.
plt.figure(figsize=(12, 6))
grid_shape = (2, 3)

# Top-left: share of each order class in 2012.
ax1 = plt.subplot2grid(shape=grid_shape, loc=(0, 0))
Class_Counts = Prod_Trade.Order_Class[is_2012].value_counts()
Class_Percent = Class_Counts / Class_Counts.sum()
ax1.set_aspect(aspect='equal')
ax1.pie(x=Class_Percent.values, labels=Class_Percent.index, autopct='%.1f%%')
ax1.set_title('各等级订单比例')

# Top-middle: monthly sales totals in 2012. The string 'sum' replaces np.sum
# (same result, no FutureWarning on recent pandas).
ax2 = plt.subplot2grid(shape=grid_shape, loc=(0, 1))
Month_Sales = Prod_Trade[is_2012].groupby(by='month').aggregate({'Sales': 'sum'})
Month_Sales.plot(title='2012年各月销售趋势', ax=ax2, legend=False)
ax2.set_xlabel('')

# Right column, spanning both rows: transport cost per transport mode
# (all years).
ax3 = plt.subplot2grid(shape=grid_shape, loc=(0, 2), rowspan=2)
sns.boxplot(x='Transport', y='Trans_Cost', data=Prod_Trade, ax=ax3)
ax3.set_title('各运输方式成本分布')
ax3.set_xlabel('')
ax3.set_ylabel('运输成本')

# Bottom row, spanning two columns: distribution of 2012 sales amounts.
ax4 = plt.subplot2grid(shape=grid_shape, loc=(1, 0), colspan=2)
sns.distplot(
    Prod_Trade.Sales[is_2012],
    bins=40,
    norm_hist=True,
    ax=ax4,
    hist_kws={'color': 'steelblue'},
    kde_kws={'linestyle': '--', 'color': 'red'},
)
ax4.set_title('2012年客单价分布图')
ax4.set_xlabel('销售额')

# Set the vertical and horizontal spacing between the panels.
plt.subplots_adjust(hspace=0.6, wspace=0.3)
plt.show()
词云分词
import jieba
import wordcloud
# Read the whole source text. The context manager closes the file handle,
# which the previous bare open(...).read() never did.
with open(r'text.txt', encoding='utf-8') as source_file:
    text = source_file.read()
def split_words(text):
    """Segment *text* with jieba and save a word-cloud image of the result.

    The segments are joined with commas, rendered by ``wordcloud.WordCloud``
    with the settings below, and written to the relative path '词云分词.png'.
    Nothing is returned.
    """
    segmented = ','.join(jieba.cut(text))

    # Words passed to WordCloud as stop words.
    stop_words = ['我们', '你们']

    cloud_options = dict(
        font_path=r'.\simhei.ttf',
        background_color='white',
        width=500,
        height=350,
        max_font_size=100,
        min_font_size=10,
        stopwords=stop_words,
        scale=15,
    )
    cloud = wordcloud.WordCloud(**cloud_options)
    cloud.generate(segmented)
    cloud.to_file(r'词云分词.png')
# Build and save the word cloud for the text loaded above.
split_words(text)
转载请注明原文地址: https://yun.8miu.com/read-136055.html