import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Three-element integer Series labelled 'A', 'B', 'C'.
s1 = Series(data=[1, 2, 3], index=list('ABC'))
s1
A    1
B    2
C    3
dtype: int64
# Four-element integer Series labelled 'B', 'C', 'D', 'E'.
s2 = Series(data=[4, 5, 6, 7], index=list('BCDE'))
s2
B    4
C    5
D    6
E    7
dtype: int64
s1 + s2
A    NaN
B    6.0
C    8.0
D    NaN
E    NaN
dtype: float64
Series 相加时会按 index 自动对齐:两边都有的 index 对应的值相加,只在其中一边出现的 index 结果为 NaN(因此结果的 dtype 变成了 float64)。
DataFrame的运算
# Two small integer frames whose labels only partly overlap: df_a is 2x2
# (rows A-B, columns 北京/上海) and df_b is 3x3 (rows A-C, plus column 广州).
df_a = DataFrame(
    data=np.arange(4).reshape((2, 2)),
    index=list('AB'),
    columns=['北京', '上海'],
)
df_b = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=list('ABC'),
    columns=['北京', '上海', '广州'],
)
df_a
df_b
  
    
      |  | 北京 | 上海 | 广州 | 
  
  
    
      | A | 0 | 1 | 2 | 
    
      | B | 3 | 4 | 5 | 
    
      | C | 6 | 7 | 8 | 
  
 df_a + df_b
  
    
      |  | 上海 | 北京 | 广州 | 
  
  
    
      | A | 2.0 | 0.0 | NaN | 
    
      | B | 7.0 | 5.0 | NaN | 
    
      | C | NaN | NaN | NaN | 
  
 类似地,DataFrame 相加时会同时按 index 和 columns 对齐:两边都存在的位置对应相加,其余位置为 NaN。
# 3x3 frame built column by column; column c3 has a missing value in row 'B',
# which is used below to show how aggregations treat NaN.
df_c = DataFrame(
    {
        'c1': [1, 4, 7],
        'c2': [2, 5, 8],
        'c3': [3, np.nan, 9],
    },
    index=list('ABC'),
)
df_c
  
    
      |  | c1 | c2 | c3 | 
  
  
    
      | A | 1 | 2 | 3.0 | 
    
      | B | 4 | 5 | NaN | 
    
      | C | 7 | 8 | 9.0 | 
  
 df_c.sum()
c1    12.0
c2    15.0
c3    12.0
dtype: float64
df_c.sum(axis = 1)
A     6.0
B     9.0
C    24.0
dtype: float64
type(df_c.sum())
pandas.core.series.Series
DataFrame 的 sum() 默认会忽略 NaN(skipna=True)。
axis=1 表示按行求和(每一行得到一个结果);默认的 axis=0 是按列求和。
df_c.describe()
  
    
      |  | c1 | c2 | c3 | 
  
  
    
      | count | 3.0 | 3.0 | 2.000000 | 
    
      | mean | 4.0 | 5.0 | 6.000000 | 
    
      | std | 3.0 | 3.0 | 4.242641 | 
    
      | min | 1.0 | 2.0 | 3.000000 | 
    
      | 25% | 2.5 | 3.5 | 4.500000 | 
    
      | 50% | 4.0 | 5.0 | 6.000000 | 
    
      | 75% | 5.5 | 6.5 | 7.500000 | 
    
      | max | 7.0 | 8.0 | 9.000000 | 
  
 s1.index
Index(['A', 'B', 'C'], dtype='object')
s1.sort_values()  # sort by value, ascending (the default)
A    1
B    2
C    3
dtype: int64
s1.sort_values(ascending=False)  # sort by value, descending
C    3
B    2
A    1
dtype: int64
s1.sort_index(ascending=False)  # sort by index label, descending
C    3
B    2
A    1
dtype: int64
# 7x5 frame of standard-normal samples with columns 'A'..'E' and the default
# integer row index; the values differ on every run because no seed is set.
df_d = DataFrame(np.random.randn(7, 5), columns=list('ABCDE'))
df_d
  
    
      |  | A | B | C | D | E | 
  
  
    
      | 0 | -0.694245 | -0.302792 | 0.667865 | 0.447782 | -0.413812 | 
    
      | 1 | -0.502081 | -1.849090 | 1.885715 | -1.117864 | 0.406936 | 
    
      | 2 | 0.384877 | 0.076701 | -1.052755 | -0.709675 | 0.272562 | 
    
      | 3 | -1.194740 | -0.518320 | -0.139549 | -0.745238 | 1.270952 | 
    
      | 4 | -1.266443 | -1.163004 | -0.644873 | -0.333446 | 0.349508 | 
    
      | 5 | -0.695937 | -0.589887 | 1.475200 | 0.278659 | 2.207159 | 
    
      | 6 | -0.712247 | 0.171372 | 0.268192 | 0.138490 | 0.604858 | 
  
 df_d.sort_values('A',ascending=False)  # sort rows by column 'A', descending
  
    
      |  | A | B | C | D | E | 
  
  
    
      | 2 | 0.384877 | 0.076701 | -1.052755 | -0.709675 | 0.272562 | 
    
      | 1 | -0.502081 | -1.849090 | 1.885715 | -1.117864 | 0.406936 | 
    
      | 0 | -0.694245 | -0.302792 | 0.667865 | 0.447782 | -0.413812 | 
    
      | 5 | -0.695937 | -0.589887 | 1.475200 | 0.278659 | 2.207159 | 
    
      | 6 | -0.712247 | 0.171372 | 0.268192 | 0.138490 | 0.604858 | 
    
      | 3 | -1.194740 | -0.518320 | -0.139549 | -0.745238 | 1.270952 | 
    
      | 4 | -1.266443 | -1.163004 | -0.644873 | -0.333446 | 0.349508 | 
  
 df_d.sort_index(ascending=False)  # sort rows by index label, descending
  
    
      |  | A | B | C | D | E | 
  
  
    
      | 6 | -0.712247 | 0.171372 | 0.268192 | 0.138490 | 0.604858 | 
    
      | 5 | -0.695937 | -0.589887 | 1.475200 | 0.278659 | 2.207159 | 
    
      | 4 | -1.266443 | -1.163004 | -0.644873 | -0.333446 | 0.349508 | 
    
      | 3 | -1.194740 | -0.518320 | -0.139549 | -0.745238 | 1.270952 | 
    
      | 2 | 0.384877 | 0.076701 | -1.052755 | -0.709675 | 0.272562 | 
    
      | 1 | -0.502081 | -1.849090 | 1.885715 | -1.117864 | 0.406936 | 
    
      | 0 | -0.694245 | -0.302792 | 0.667865 | 0.447782 | -0.413812 | 
  
 所谓课后练习:将一个已有 csv 文件中的三列提取出来,按其中一列降序排列,要求用一行代码完成(真是……)。
import pandas as pd

# Exercise: pull three columns out of an existing CSV, sort them by IMDb score
# (highest first) and save the result.
# The path contains non-ASCII characters, so the file is opened explicitly as
# UTF-8 text and the open handle, rather than the path string, is given to
# pandas. The `with` block closes the handle as soon as parsing finishes, so it
# is not left open for the rest of the session.
with open("E:/Python3数据科学入门与实战/project/o25mso/homework/movie_metadata.csv","r",encoding="utf-8") as csv_file:
    read_csv_in = pd.read_csv(csv_file)
df = read_csv_in[['imdb_score','director_name','movie_title']].sort_values('imdb_score',ascending=False)
# to_csv keeps the index by default, so the original row labels are written as
# the first (unnamed) column of new_imdb.csv.
df.to_csv('new_imdb.csv')
#pd.read_csv(open("E:/Python3数据科学入门与实战/project/o25mso/homework/movie_metadata.csv"))[['imdb_score','director_name','movie_title']].sort_values('imdb_score',ascending=False)
!ls
csv_practice.ipynb
demo.ipynb
new_imdb.csv
practice-2018-08-11.ipynb
路径中含有中文时,可以先用 open("filename","r",encoding="utf-8") 打开文件,
再把得到的文件对象传给 pandas.read_csv() 读取。注意要用 utf-8 编码打开,否则可能会出现问题(例如解码错误或乱码)。
这样做的主要好处是允许路径中包含中文。
df.head()
  
    
      |  | imdb_score | director_name | movie_title | 
  
  
    
      | 2765 | 9.5 | John Blanchard | Towering Inferno | 
    
      | 1937 | 9.3 | Frank Darabont | The Shawshank Redemption | 
    
      | 3466 | 9.2 | Francis Ford Coppola | The Godfather | 
    
      | 4409 | 9.1 | John Stockwell | Kickboxer: Vengeance | 
    
      | 2824 | 9.1 | NaN | Dekalog |