import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Three integers labelled 'A', 'B', 'C'; the bare name on the last line
# makes the notebook display the Series.
s1 = Series(
    data=[1, 2, 3],
    index=['A', 'B', 'C'],
)
s1
A 1
B 2
C 3
dtype: int64
# Four integers labelled 'B'..'E'; only 'B' and 'C' also appear in s1.
s2 = Series(
    data=[4, 5, 6, 7],
    index=['B', 'C', 'D', 'E'],
)
s2
B 4
C 5
D 6
E 7
dtype: int64
# Addition aligns on index labels: shared labels (B, C) are summed, labels
# found in only one operand (A, D, E) become NaN, so the result is float64.
s1 + s2
A NaN
B 6.0
C 8.0
D NaN
E NaN
dtype: float64
两个Series相加时按index对齐:index相同的元素相加,只出现在其中一个Series里的index,结果为NaN。
DataFrame的运算
# 2x2 frame holding 0..3, rows A-B, columns 北京/上海.
df_a = DataFrame(
    data=np.arange(4).reshape((2, 2)),
    index=['A', 'B'],
    columns=['北京', '上海'],
)
# 3x3 frame holding 0..8; adds row C and column 广州 on top of df_a's labels.
df_b = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=['A', 'B', 'C'],
    columns=['北京', '上海', '广州'],
)
df_a
df_b
|
北京 |
上海 |
广州 |
A |
0 |
1 |
2 |
B |
3 |
4 |
5 |
C |
6 |
7 |
8 |
# Addition aligns on both row and column labels; any cell missing from
# either frame (row C, column 广州) comes out as NaN.
df_a + df_b
|
上海 |
北京 |
广州 |
A |
2.0 |
0.0 |
NaN |
B |
7.0 |
5.0 |
NaN |
C |
NaN |
NaN |
NaN |
类似地,DataFrame相加时只有index和columns都能对上的位置才会相加,其余位置为NaN。
# 3x3 frame with a single missing value at (B, c3), used below to show how
# reductions treat NaN.
df_c = DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, np.nan],
        [7, 8, 9],
    ],
    index=['A', 'B', 'C'],
    columns=['c1', 'c2', 'c3'],
)
df_c
|
c1 |
c2 |
c3 |
A |
1 |
2 |
3.0 |
B |
4 |
5 |
NaN |
C |
7 |
8 |
9.0 |
# Per-column totals; the NaN in c3 is skipped (3 + 9 = 12).
df_c.sum()
c1 12.0
c2 15.0
c3 12.0
dtype: float64
# Per-row totals (summing across the columns); row B skips its NaN (4 + 5 = 9).
df_c.sum(axis = 1)
A 6.0
B 9.0
C 24.0
dtype: float64
# Reducing a DataFrame with sum() yields a Series.
type(df_c.sum())
pandas.core.series.Series
DataFrame求和时默认会忽略NaN(skipna=True)。
axis=1 表示按行计算,即对每一行的各列求和;默认的 axis=0 则是按列计算。
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# The count for c3 is 2 because its NaN is not counted.
df_c.describe()
|
c1 |
c2 |
c3 |
count |
3.0 |
3.0 |
2.000000 |
mean |
4.0 |
5.0 |
6.000000 |
std |
3.0 |
3.0 |
4.242641 |
min |
1.0 |
2.0 |
3.000000 |
25% |
2.5 |
3.5 |
4.500000 |
50% |
4.0 |
5.0 |
6.000000 |
75% |
5.5 |
6.5 |
7.500000 |
max |
7.0 |
8.0 |
9.000000 |
# Inspect the index labels of s1.
s1.index
Index(['A', 'B', 'C'], dtype='object')
s1.sort_values()  # sort by value, ascending
A 1
B 2
C 3
dtype: int64
s1.sort_values(ascending=False)  # sort by value, descending
C 3
B 2
A 1
dtype: int64
s1.sort_index(ascending=False)  # sort by index label, descending
C 3
B 2
A 1
dtype: int64
# 7x5 frame of standard-normal samples with columns A-E and the default
# integer row index; values differ on every run.
df_d = DataFrame(
    data=np.random.randn(35).reshape((7, 5)),
    columns=['A', 'B', 'C', 'D', 'E'],
)
df_d
|
A |
B |
C |
D |
E |
0 |
-0.694245 |
-0.302792 |
0.667865 |
0.447782 |
-0.413812 |
1 |
-0.502081 |
-1.849090 |
1.885715 |
-1.117864 |
0.406936 |
2 |
0.384877 |
0.076701 |
-1.052755 |
-0.709675 |
0.272562 |
3 |
-1.194740 |
-0.518320 |
-0.139549 |
-0.745238 |
1.270952 |
4 |
-1.266443 |
-1.163004 |
-0.644873 |
-0.333446 |
0.349508 |
5 |
-0.695937 |
-0.589887 |
1.475200 |
0.278659 |
2.207159 |
6 |
-0.712247 |
0.171372 |
0.268192 |
0.138490 |
0.604858 |
df_d.sort_values('A',ascending=False)  # sort rows by column A, descending
|
A |
B |
C |
D |
E |
2 |
0.384877 |
0.076701 |
-1.052755 |
-0.709675 |
0.272562 |
1 |
-0.502081 |
-1.849090 |
1.885715 |
-1.117864 |
0.406936 |
0 |
-0.694245 |
-0.302792 |
0.667865 |
0.447782 |
-0.413812 |
5 |
-0.695937 |
-0.589887 |
1.475200 |
0.278659 |
2.207159 |
6 |
-0.712247 |
0.171372 |
0.268192 |
0.138490 |
0.604858 |
3 |
-1.194740 |
-0.518320 |
-0.139549 |
-0.745238 |
1.270952 |
4 |
-1.266443 |
-1.163004 |
-0.644873 |
-0.333446 |
0.349508 |
df_d.sort_index(ascending=False)  # sort rows by index label, descending
|
A |
B |
C |
D |
E |
6 |
-0.712247 |
0.171372 |
0.268192 |
0.138490 |
0.604858 |
5 |
-0.695937 |
-0.589887 |
1.475200 |
0.278659 |
2.207159 |
4 |
-1.266443 |
-1.163004 |
-0.644873 |
-0.333446 |
0.349508 |
3 |
-1.194740 |
-0.518320 |
-0.139549 |
-0.745238 |
1.270952 |
2 |
0.384877 |
0.076701 |
-1.052755 |
-0.709675 |
0.272562 |
1 |
-0.502081 |
-1.849090 |
1.885715 |
-1.117864 |
0.406936 |
0 |
-0.694245 |
-0.302792 |
0.667865 |
0.447782 |
-0.413812 |
课后练习:从一个已有的csv文件中提取三列,并按其中一列降序排列,要求用一行代码完成。
import pandas as pd
# Exercise: extract three columns from the movie metadata CSV, sort them by
# IMDb score (highest first) and save the result next to the notebook.

# The path contains non-ASCII characters, so the file is opened explicitly
# and the handle is passed to pandas instead of the path string.
csv_path = "E:/Python3数据科学入门与实战/project/o25mso/homework/movie_metadata.csv"

# The context manager closes the handle once pandas has read it; the
# previous bare open() left the file open for the rest of the session.
with open(csv_path, "r", encoding="utf-8") as csv_file:
    read_csv_in = pd.read_csv(csv_file)

# Keep only the three columns of interest, ordered by score descending.
df = read_csv_in[['imdb_score', 'director_name', 'movie_title']].sort_values(
    'imdb_score', ascending=False
)

# Written to the current working directory; the original row index is kept
# as the first column of the output file.
df.to_csv('new_imdb.csv')

# One-line form asked for by the exercise, kept for reference only: it never
# closes the file handle and relies on the platform default encoding.
# pd.read_csv(open("E:/Python3数据科学入门与实战/project/o25mso/homework/movie_metadata.csv"))[['imdb_score','director_name','movie_title']].sort_values('imdb_score',ascending=False)
!ls
csv_practice.ipynb
demo.ipynb
new_imdb.csv
practice-2018-08-11.ipynb
路径中含有中文时,先用 open("filename","r",encoding="utf-8") 打开文件,
再把得到的文件对象传给 pandas.read_csv() 读取。注意要显式指定与文件实际编码一致的 encoding(这里是 utf-8),否则可能出现解码错误或乱码。
这样做的主要好处是路径中可以包含中文。
# Show the first five rows of the sorted frame, i.e. the highest-scored titles.
df.head()
|
imdb_score |
director_name |
movie_title |
2765 |
9.5 |
John Blanchard |
Towering Inferno |
1937 |
9.3 |
Frank Darabont |
The Shawshank Redemption |
3466 |
9.2 |
Francis Ford Coppola |
The Godfather |
4409 |
9.1 |
John Stockwell |
Kickboxer: Vengeance |
2824 |
9.1 |
NaN |
Dekalog |