기계학습_pandas basic

2주차_ pandas 기초 문법

Posted by iheese on September 08, 2021 · 54 mins read


import numpy as np
import pandas as pd


object creation


s=pd.Series([1,3,5,np.nan,6,8])
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


dates=pd.date_range("20130101",periods=6)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


df= pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))
df
A B C D
2013-01-01 -1.171673 -1.282048 -1.240960 -1.382591
2013-01-02 0.747014 -0.098745 -2.674977 -0.766007
2013-01-03 0.344874 -1.071516 0.138593 0.563455
2013-01-04 0.032506 0.100209 -0.258473 -1.114725
2013-01-05 0.141728 2.327885 1.172082 1.085766
2013-01-06 -1.335232 -1.043389 2.569405 1.850168


df2= pd.DataFrame({"A":1.0,"B":pd.Timestamp("20130102"),
                   "C":pd.Series(1,index=list(range(4)),dtype="float32"),
                   "D":np.array([3]*4,dtype="int32"),
                   "E":pd.Categorical(["test","train","test","train"]),
                   "F":"foo",})
df2
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo


df2.dtypes
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


df.head()
A B C D
2013-01-01 -1.171673 -1.282048 -1.240960 -1.382591
2013-01-02 0.747014 -0.098745 -2.674977 -0.766007
2013-01-03 0.344874 -1.071516 0.138593 0.563455
2013-01-04 0.032506 0.100209 -0.258473 -1.114725
2013-01-05 0.141728 2.327885 1.172082 1.085766


df.tail()
A B C D
2013-01-02 0.747014 -0.098745 -2.674977 -0.766007
2013-01-03 0.344874 -1.071516 0.138593 0.563455
2013-01-04 0.032506 0.100209 -0.258473 -1.114725
2013-01-05 0.141728 2.327885 1.172082 1.085766
2013-01-06 -1.335232 -1.043389 2.569405 1.850168


df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')


df.to_numpy()
array([[ 1.59342018,  2.65239188,  0.49500638, -0.73513749],
       [ 0.56375966, -2.66504831,  0.37451522, -1.10341922],
       [-0.2827687 ,  1.07071856, -0.50046244, -0.48868478],
       [-1.66030333, -0.08691766,  0.55157115, -0.26135908],
       [-0.40783401, -0.83482756, -1.80988001, -1.30816714],
       [ 0.22261522, -1.87903334,  2.08030501, -0.16128424]])


df2.to_numpy()
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)


df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.004815 -0.290453 0.198509 -0.676342
std 1.087118 1.950035 1.288944 0.459949
min -1.660303 -2.665048 -1.809880 -1.308167
25% -0.376568 -1.617982 -0.281718 -1.011349
50% -0.030077 -0.460873 0.434761 -0.611911
75% 0.478474 0.781310 0.537430 -0.318191
max 1.593420 2.652392 2.080305 -0.161284


df.T
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A 1.593420 0.563760 -0.282769 -1.660303 -0.407834 0.222615
B 2.652392 -2.665048 1.070719 -0.086918 -0.834828 -1.879033
C 0.495006 0.374515 -0.500462 0.551571 -1.809880 2.080305
D -0.735137 -1.103419 -0.488685 -0.261359 -1.308167 -0.161284


df.sort_values(by='B') 
A B C D
2013-01-02 0.563760 -2.665048 0.374515 -1.103419
2013-01-06 0.222615 -1.879033 2.080305 -0.161284
2013-01-05 -0.407834 -0.834828 -1.809880 -1.308167
2013-01-04 -1.660303 -0.086918 0.551571 -0.261359
2013-01-03 -0.282769 1.070719 -0.500462 -0.488685
2013-01-01 1.593420 2.652392 0.495006 -0.735137


Selection


df['A'] #Getting
2013-01-01    1.593420
2013-01-02    0.563760
2013-01-03   -0.282769
2013-01-04   -1.660303
2013-01-05   -0.407834
2013-01-06    0.222615
Freq: D, Name: A, dtype: float64


df[0:3] #slice the rows
A B C D
2013-01-01 1.593420 2.652392 0.495006 -0.735137
2013-01-02 0.563760 -2.665048 0.374515 -1.103419
2013-01-03 -0.282769 1.070719 -0.500462 -0.488685


df["20130102":"20130104"]
A B C D
2013-01-02 0.563760 -2.665048 0.374515 -1.103419
2013-01-03 -0.282769 1.070719 -0.500462 -0.488685
2013-01-04 -1.660303 -0.086918 0.551571 -0.261359


df.loc[dates[0]]
A    1.593420
B    2.652392
C    0.495006
D   -0.735137
Name: 2013-01-01 00:00:00, dtype: float64


df.loc[:,["A","B"]] #Selection by Label
A B
2013-01-01 1.593420 2.652392
2013-01-02 0.563760 -2.665048
2013-01-03 -0.282769 1.070719
2013-01-04 -1.660303 -0.086918
2013-01-05 -0.407834 -0.834828
2013-01-06 0.222615 -1.879033


df.loc["20130102":"20130104",["A","B"]]
A B
2013-01-02 0.563760 -2.665048
2013-01-03 -0.282769 1.070719
2013-01-04 -1.660303 -0.086918


df.loc["20130102", ["A","B"]]
A    0.563760
B   -2.665048
Name: 2013-01-02 00:00:00, dtype: float64


df.loc[dates[0],"A"]
1.5934201795971803


df.at[dates[0],"A"]
1.5934201795971803


Selection by position


df.iloc[3]
A   -1.660303
B   -0.086918
C    0.551571
D   -0.261359
Name: 2013-01-04 00:00:00, dtype: float64


df.iloc[3:5,0:2]
A B
2013-01-04 -1.660303 -0.086918
2013-01-05 -0.407834 -0.834828


df.iloc[[1,2,4],[0,2]]
A C
2013-01-02 0.563760 0.374515
2013-01-03 -0.282769 -0.500462
2013-01-05 -0.407834 -1.809880


df.iloc[1:3,:]
A B C D
2013-01-02 0.563760 -2.665048 0.374515 -1.103419
2013-01-03 -0.282769 1.070719 -0.500462 -0.488685


df.iloc[:,1:3]
B C
2013-01-01 2.652392 0.495006
2013-01-02 -2.665048 0.374515
2013-01-03 1.070719 -0.500462
2013-01-04 -0.086918 0.551571
2013-01-05 -0.834828 -1.809880
2013-01-06 -1.879033 2.080305


df.iloc[1,1]
-2.6650483125361175


df.iat[1,1] #equivalent to the prior method
-2.6650483125361175


Boolean indexing


df[df["A"]>0]
A B C D
2013-01-01 1.593420 2.652392 0.495006 -0.735137
2013-01-02 0.563760 -2.665048 0.374515 -1.103419
2013-01-06 0.222615 -1.879033 2.080305 -0.161284


df[df>0]
A B C D
2013-01-01 1.593420 2.652392 0.495006 NaN
2013-01-02 0.563760 NaN 0.374515 NaN
2013-01-03 NaN 1.070719 NaN NaN
2013-01-04 NaN NaN 0.551571 NaN
2013-01-05 NaN NaN NaN NaN
2013-01-06 0.222615 NaN 2.080305 NaN


df2=df.copy() #make same DataFrame
df2["E"]=["one","one","two","three","four","three"]
df2
A B C D E
2013-01-01 1.593420 2.652392 0.495006 -0.735137 one
2013-01-02 0.563760 -2.665048 0.374515 -1.103419 one
2013-01-03 -0.282769 1.070719 -0.500462 -0.488685 two
2013-01-04 -1.660303 -0.086918 0.551571 -0.261359 three
2013-01-05 -0.407834 -0.834828 -1.809880 -1.308167 four
2013-01-06 0.222615 -1.879033 2.080305 -0.161284 three


df2[df2["E"].isin(["two","four"])] #isin() method for filtering
A B C D E
2013-01-03 -0.282769 1.070719 -0.500462 -0.488685 two
2013-01-05 -0.407834 -0.834828 -1.809880 -1.308167 four


Setting


s1=pd.Series([1,2,3,4,5,6],index=pd.date_range("20130102",periods=6))
s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64


df['F']=s1
df
A B C D F
2013-01-01 -1.171673 -1.282048 -1.240960 -1.382591 NaN
2013-01-02 0.747014 -0.098745 -2.674977 -0.766007 1.0
2013-01-03 0.344874 -1.071516 0.138593 0.563455 2.0
2013-01-04 0.032506 0.100209 -0.258473 -1.114725 3.0
2013-01-05 0.141728 2.327885 1.172082 1.085766 4.0
2013-01-06 -1.335232 -1.043389 2.569405 1.850168 5.0


df.at[dates[0],"A"]=0


df.iat[0,1]=0


df.loc[:,"D"]=np.array([5]*len(df))


df
A B C D F
2013-01-01 0.000000 0.000000 -1.240960 5 NaN
2013-01-02 0.747014 -0.098745 -2.674977 5 1.0
2013-01-03 0.344874 -1.071516 0.138593 5 2.0
2013-01-04 0.032506 0.100209 -0.258473 5 3.0
2013-01-05 0.141728 2.327885 1.172082 5 4.0
2013-01-06 -1.335232 -1.043389 2.569405 5 5.0


df2=df.copy()
df2[df2>0] =-df2
df2
A B C D F
2013-01-01 0.000000 0.000000 -1.240960 -5 NaN
2013-01-02 -0.747014 -0.098745 -2.674977 -5 -1.0
2013-01-03 -0.344874 -1.071516 -0.138593 -5 -2.0
2013-01-04 -0.032506 -0.100209 -0.258473 -5 -3.0
2013-01-05 -0.141728 -2.327885 -1.172082 -5 -4.0
2013-01-06 -1.335232 -1.043389 -2.569405 -5 -5.0


Missing data


df1=df.reindex(index=dates[0:4],columns=list(df.columns)+["E"])
df1.loc[dates[0]:dates[1],"E"]=1
df1   #reindexing: to change,add,delete the index on a specified axis
A B C D F E
2013-01-01 0.000000 0.000000 -1.240960 5 NaN 1.0
2013-01-02 0.747014 -0.098745 -2.674977 5 1.0 1.0
2013-01-03 0.344874 -1.071516 0.138593 5 2.0 NaN
2013-01-04 0.032506 0.100209 -0.258473 5 3.0 NaN


df1.dropna(how="any") #drop any rows that have missing data
A B C D F E
2013-01-02 0.747014 -0.098745 -2.674977 5 1.0 1.0


df1.fillna(value=5)
A B C D F E
2013-01-01 0.000000 0.000000 -1.240960 5 5.0 1.0
2013-01-02 0.747014 -0.098745 -2.674977 5 1.0 1.0
2013-01-03 0.344874 -1.071516 0.138593 5 2.0 5.0
2013-01-04 0.032506 0.100209 -0.258473 5 3.0 5.0


pd.isna(df1)
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True


Operations


df.mean() #stats
A   -0.011518
B    0.035741
C   -0.049055
D    5.000000
F    3.000000
dtype: float64


df.mean(1) #axis=1,row
2013-01-01    0.939760
2013-01-02    0.794659
2013-01-03    1.282390
2013-01-04    1.574848
2013-01-05    2.528339
2013-01-06    2.038157
Freq: D, dtype: float64


df.mean(0) #axis=0,columns
A   -0.011518
B    0.035741
C   -0.049055
D    5.000000
F    3.000000
dtype: float64


s=pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) #shft: move forward 2
s
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64


df.sub(s,axis="index")
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -0.655126 -2.071516 -0.861407 4.0 1.0
2013-01-04 -2.967494 -2.899791 -3.258473 2.0 0.0
2013-01-05 -4.858272 -2.672115 -3.827918 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN


df
A B C D F
2013-01-01 0.000000 0.000000 -1.240960 5 NaN
2013-01-02 0.747014 -0.098745 -2.674977 5 1.0
2013-01-03 0.344874 -1.071516 0.138593 5 2.0
2013-01-04 0.032506 0.100209 -0.258473 5 3.0
2013-01-05 0.141728 2.327885 1.172082 5 4.0
2013-01-06 -1.335232 -1.043389 2.569405 5 5.0


df.apply(np.cumsum) #cumulative
A B C D F
2013-01-01 0.000000 0.000000 -1.240960 5 NaN
2013-01-02 0.747014 -0.098745 -3.915937 10 1.0
2013-01-03 1.091887 -1.170260 -3.777344 15 3.0
2013-01-04 1.124393 -1.070051 -4.035818 20 6.0
2013-01-05 1.266121 1.257834 -2.863735 25 10.0
2013-01-06 -0.069110 0.214444 -0.294330 30 15.0


df.apply(lambda x: x.max() -x.min()) #apply
A    2.082246
B    3.399400
C    5.244382
D    0.000000
F    4.000000
dtype: float64


s=pd.Series(np.random.randint(0,7,size=10)) #histogramming
s
0    5
1    0
2    5
3    5
4    3
5    1
6    4
7    3
8    6
9    3
dtype: int32


s.value_counts().sort_index() #add sort_index()
0    1
1    1
3    3
4    1
5    3
6    1
dtype: int64


String Methods


s=pd.Series(["A","B","C","Aaba","Baca",np.nan,"CABA","dog","cat"])
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object


Merge


df=pd.DataFrame(np.random.randn(10,4)) #concat
df
0 1 2 3
0 -0.025866 -0.178967 -0.808695 0.927172
1 -0.842495 0.707233 0.222576 0.455422
2 -0.328742 -1.270708 -0.508805 -0.018087
3 1.272699 -1.671005 -1.107406 0.243692
4 -1.867842 0.893841 -1.515523 -1.106920
5 -0.588635 -0.480732 0.034501 0.348179
6 0.270778 0.133031 -1.371957 -2.978966
7 0.650808 0.742551 -2.189231 -0.465399
8 -0.919177 1.063680 -0.075367 -0.147211
9 -0.288339 -0.125634 0.758688 -0.080836


pieces=[df[:3],df[3:7],df[7:]]

pd.concat(pieces)
0 1 2 3
0 -0.025866 -0.178967 -0.808695 0.927172
1 -0.842495 0.707233 0.222576 0.455422
2 -0.328742 -1.270708 -0.508805 -0.018087
3 1.272699 -1.671005 -1.107406 0.243692
4 -1.867842 0.893841 -1.515523 -1.106920
5 -0.588635 -0.480732 0.034501 0.348179
6 0.270778 0.133031 -1.371957 -2.978966
7 0.650808 0.742551 -2.189231 -0.465399
8 -0.919177 1.063680 -0.075367 -0.147211
9 -0.288339 -0.125634 0.758688 -0.080836


left=pd.DataFrame({"key": ["foo","foo"],"lval":[1,2]})
right=pd.DataFrame({"key": ["foo","foo"],"rval":[4,5]})
print(left)
print(right)
   key  lval
0  foo     1
1  foo     2
   key  rval
0  foo     4
1  foo     5


pd.merge(left,right,on="key") #SQL style merge
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5


left=pd.DataFrame({"key": ["foo","bar"],"lval":[1,2]})
right=pd.DataFrame({"key": ["foo","bar"],"rval":[4,5]})
print(left)
print(right)
   key  lval
0  foo     1
1  bar     2
   key  rval
0  foo     4
1  bar     5


pd.merge(left,right, on="key")
key lval rval
0 foo 1 4
1 bar 2 5


Grouping


df=pd.DataFrame({"A":["foo","bar","foo","bar","foo","bar","foo","foo"],
                "B":["one","one","two","three","two","two","one","three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8)
                }
               )
df
A B C D
0 foo one 1.048840 -0.833822
1 bar one -1.137147 -2.027540
2 foo two -0.657997 0.608757
3 bar three -0.413567 -0.753453
4 foo two -0.454805 -1.496538
5 bar two 0.493420 -0.384181
6 foo one -0.561297 1.679049
7 foo three 1.841777 0.482405


df.groupby("A").sum()
C D
A
bar -1.057294 -3.165174
foo 1.216518 0.439850


df.groupby(["A","B"]).sum()
C D
A B
bar one -1.137147 -2.027540
three -0.413567 -0.753453
two 0.493420 -0.384181
foo one 0.487543 0.845226
three 1.841777 0.482405
two -1.112803 -0.887780


Reshaping


tuples = list(
        zip(
            *[
              ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
              ["one", "two", "one", "two", "one", "two", "one", "two"],
          ]
       )
   )
index= pd.MultiIndex.from_tuples(tuples,names=["first","second"])
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=["A","B"])
df2=df[:4]
df2
A B
first second
bar one 0.890952 -0.375587
two 1.001416 -1.333297
baz one -0.562930 -0.951540
two 0.273894 -0.832586


stacked=df2.stack()
stacked  #compresse a level in DataFrame's columns

first  second   
bar    one     A    0.890952
               B   -0.375587
       two     A    1.001416
               B   -1.333297
baz    one     A   -0.562930
               B   -0.951540
       two     A    0.273894
               B   -0.832586
dtype: float64


stacked.unstack() # inverse operation of stack()
A B
first second
bar one 0.890952 -0.375587
two 1.001416 -1.333297
baz one -0.562930 -0.951540
two 0.273894 -0.832586


print(stacked.unstack(0))
print(stacked.unstack(1))
first          bar       baz
second                      
one    A  0.890952 -0.562930
       B -0.375587 -0.951540
two    A  1.001416  0.273894
       B -1.333297 -0.832586
second        one       two
first                      
bar   A  0.890952  1.001416
      B -0.375587 -1.333297
baz   A -0.562930  0.273894
      B -0.951540 -0.832586


Pivot tables


df=pd.DataFrame({
    "A": ["one", "one", "two", "three"] * 3,
    "B": ["A", "B", "C"] * 4,
    "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
    "D": np.random.randn(12),
    "E": np.random.randn(12),
}                
)
df
A B C D E
0 one A foo -0.373453 -1.406135
1 one B foo 1.264894 -1.389622
2 two C foo -0.242048 -1.593581
3 three A bar 2.716601 1.612520
4 one B bar -1.528681 0.602288
5 one C bar -0.074750 2.044902
6 two A foo 0.110851 -0.740685
7 three B foo -0.915239 0.589600
8 one C foo 0.377855 -0.094179
9 one A bar -0.965858 -1.770097
10 two B bar -0.943708 0.623095
11 three C bar -0.096407 -0.637673


pd.pivot_table(df,values="D",index=["A","B"],columns=["C"])
C bar foo
A B
one A -0.965858 -0.373453
B -1.528681 1.264894
C -0.074750 0.377855
three A 2.716601 NaN
B NaN -0.915239
C -0.096407 NaN
two A NaN 0.110851
B -0.943708 NaN
C NaN -0.242048


Time series


rng = pd.date_range("1/1/2012", periods=100, freq="S")
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample("5Min").sum() #sum every 5 mins
2012-01-01    26705
Freq: 5T, dtype: int32


ts.resample("1Min").sum() #sum every 1 min
2012-01-01 00:00:00    15006
2012-01-01 00:01:00    11699
Freq: T, dtype: int32


rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
2012-03-06    1.385727
2012-03-07   -1.201650
2012-03-08   -0.135337
2012-03-09   -0.287460
2012-03-10    0.454925
Freq: D, dtype: float64


ts_utc = ts.tz_localize("UTC")
ts_utc
2012-03-06 00:00:00+00:00    1.385727
2012-03-07 00:00:00+00:00   -1.201650
2012-03-08 00:00:00+00:00   -0.135337
2012-03-09 00:00:00+00:00   -0.287460
2012-03-10 00:00:00+00:00    0.454925
Freq: D, dtype: float64


ts_utc.tz_convert("US/Eastern") #covertion to another time zone
2012-03-05 19:00:00-05:00    1.385727
2012-03-06 19:00:00-05:00   -1.201650
2012-03-07 19:00:00-05:00   -0.135337
2012-03-08 19:00:00-05:00   -0.287460
2012-03-09 19:00:00-05:00    0.454925
Freq: D, dtype: float64


rng = pd.date_range("1/1/2012", periods=5, freq="M")
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2012-01-31    0.252942
2012-02-29   -0.028175
2012-03-31   -0.639577
2012-04-30   -2.170122
2012-05-31    0.290485
Freq: M, dtype: float64


ps=ts.to_period()
ps
2012-01    0.252942
2012-02   -0.028175
2012-03   -0.639577
2012-04   -2.170122
2012-05    0.290485
Freq: M, dtype: float64


ps.to_timestamp()
2012-01-01    0.252942
2012-02-01   -0.028175
2012-03-01   -0.639577
2012-04-01   -2.170122
2012-05-01    0.290485
Freq: MS, dtype: float64


prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV")
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9
ts.head()
1990-03-01 09:00    0.747117
1990-06-01 09:00    1.203087
1990-09-01 09:00   -0.941095
1990-12-01 09:00   -0.126587
1991-03-01 09:00   -1.061227
Freq: H, dtype: float64


Categoricals


df = pd.DataFrame(
         {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} 
)
df
id raw_grade
0 1 a
1 2 b
2 3 b
3 4 a
4 5 a
5 6 e


df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']


df["grade"]=df["grade"].cat.set_categories(["very good", "good", "very bad"])
df["grade"] #rename the categories
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']


df.sort_values(by="grade")
id raw_grade grade
0 1 a NaN
1 2 b NaN
2 3 b NaN
3 4 a NaN
4 5 a NaN
5 6 e NaN


df.groupby("grade").size()
grade
very good    0
good         0
very bad     0
dtype: int64


Plotting


import matplotlib.pyplot as plt
plt.close("all") #close a figure window


ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000))
ts = ts.cumsum()
ts.plot();

plot1


df=pd.DataFrame(
                np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"]
)


df=df.cumsum()


plt.figure();
<Figure size 432x288 with 0 Axes>


df.plot();
plt.legend(loc='best');

plot2


Getting data in/out (csv)


df.to_csv("foo.csv") #writing to c csv file


pd.read_csv("foo.csv") #reading from a csv file
Unnamed: 0 A B C D
0 2000-01-01 1.783768 2.045073 -0.507718 1.123060
1 2000-01-02 0.196039 2.711749 0.416838 1.080363
2 2000-01-03 1.208280 1.600025 0.633446 1.539429
3 2000-01-04 2.404634 3.237997 0.450288 0.621748
4 2000-01-05 1.188598 3.202251 1.098251 1.717754
... ... ... ... ... ...
995 2002-09-22 -9.342127 24.392143 10.564637 -10.164637
996 2002-09-23 -8.419079 26.097426 8.878737 -9.608197
997 2002-09-24 -7.336001 25.781776 10.905896 -10.742856
998 2002-09-25 -7.736148 25.178845 11.460886 -10.487685
999 2002-09-26 -6.694487 24.414987 12.606476 -9.257457

1000 rows × 5 columns


HDF5


df.to_hdf("foo.h5", "df")


pd.read_hdf("foo.h5", "df")


Excel


df.to_excel("foo.xlsx", sheet_name="Sheet1")


pd.read_excel("foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])


Gotchas


if pd.Series([False, True, False]):
    print("I was True")
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

~\AppData\Local\Temp/ipykernel_12628/2697453150.py in <module>
----> 1 if pd.Series([False, True, False]):
      2     print("I was True")


~\miniconda3\envs\cal2020\lib\site-packages\pandas\core\generic.py in __nonzero__(self)
   1536     def __nonzero__(self):
   1537         raise ValueError(
-> 1538             f"The truth value of a {type(self).__name__} is ambiguous. "
   1539             "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
   1540         )


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


if pd.Series([False, True, False]) is not None:
    print("I was True")
I was True


if pd.Series([False, False, False]) is not None:
    print("I was True")  <BR/>

##None is not definition
I was True


Reference: