Pandas Library


Pandas
In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# One-dimensional ndarray with axis labels (including time series).

# A Series is a one-dimensional labelled array (works for time series too).
# The NaN forces the dtype up to float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)

s
Out[2]:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
In [3]:
#Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).
#Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. 
#The primary pandas data structure


# A DataFrame is a two-dimensional, size-mutable table with labelled rows
# and columns -- the primary pandas data structure.  Arithmetic aligns on
# both axes; think of it as a dict-like container of Series.

# Six consecutive daily timestamps starting 2013-01-01.
dates = pd.date_range('20130101', periods=6)

dates

# 6x4 table of scaled standard-normal draws, indexed by the dates.
df = pd.DataFrame(10 * np.random.randn(6, 4), index=dates, columns=list('ABCD'))

df
Out[3]:
A B C D
2013-01-01 5.701238 6.855059 7.280012 -7.749124
2013-01-02 7.709387 15.030319 -12.744268 5.102852
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358
2013-01-04 3.502094 -2.084429 -16.141706 -5.250545
2013-01-05 9.149365 1.138663 14.292277 7.215540
2013-01-06 1.037223 -11.665085 0.826159 11.337846
In [4]:
#Creating a DataFrame by passing a dict of objects that can be converted to series-like.

# A DataFrame can also be built from a dict whose values are anything
# convertible to a Series; scalars ('A', 'B', 'F') are broadcast to the
# length implied by the sequence-valued columns.
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

df2
Out[4]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [5]:
df2.dtypes
Out[5]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
In [6]:
#Subset of the attributes

# Tab completion: typing df2.<TAB> in a live IPython/Jupyter session lists
# the frame's attributes and column names.  It is an interactive editor
# feature, NOT valid Python syntax -- running it literally raises a
# SyntaxError -- so it is shown here only as a comment.
# df2.<TAB>
In [8]:
df.head()
Out[8]:
A B C D
2013-01-01 5.701238 6.855059 7.280012 -7.749124
2013-01-02 7.709387 15.030319 -12.744268 5.102852
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358
2013-01-04 3.502094 -2.084429 -16.141706 -5.250545
2013-01-05 9.149365 1.138663 14.292277 7.215540
In [9]:
df.tail(3)
Out[9]:
A B C D
2013-01-04 3.502094 -2.084429 -16.141706 -5.250545
2013-01-05 9.149365 1.138663 14.292277 7.215540
2013-01-06 1.037223 -11.665085 0.826159 11.337846
In [10]:
df.index
Out[10]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [11]:
df.columns
Out[11]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [12]:
df.values
Out[12]:
array([[  5.7012378 ,   6.85505883,   7.28001198,  -7.7491236 ],
       [  7.70938659,  15.03031926, -12.74426791,   5.10285248],
       [ -1.80135618,   3.93479393,  -5.57257698,  -3.97035788],
       [  3.50209395,  -2.08442948, -16.14170628,  -5.25054536],
       [  9.14936458,   1.13866273,  14.29227682,   7.21553986],
       [  1.03722287, -11.66508526,   0.82615942,  11.33784591]])
In [13]:
# Describe shows a quick statistic summary of your data

df.describe()
Out[13]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 4.216325 2.201553 -2.010017 1.114369
std 4.136922 8.955663 11.728034 7.779201
min -1.801356 -11.665085 -16.141706 -7.749124
25% 1.653441 -1.278656 -10.951345 -4.930498
50% 4.601666 2.536728 -2.373209 0.566247
75% 7.207349 6.124993 5.666549 6.687368
max 9.149365 15.030319 14.292277 11.337846
In [14]:
# Transposing your data

df.T
Out[14]:
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A 5.701238 7.709387 -1.801356 3.502094 9.149365 1.037223
B 6.855059 15.030319 3.934794 -2.084429 1.138663 -11.665085
C 7.280012 -12.744268 -5.572577 -16.141706 14.292277 0.826159
D -7.749124 5.102852 -3.970358 -5.250545 7.215540 11.337846
In [15]:
# Sorting by an axis
df.sort_index(axis=1, ascending=False)
Out[15]:
D C B A
2013-01-01 -7.749124 7.280012 6.855059 5.701238
2013-01-02 5.102852 -12.744268 15.030319 7.709387
2013-01-03 -3.970358 -5.572577 3.934794 -1.801356
2013-01-04 -5.250545 -16.141706 -2.084429 3.502094
2013-01-05 7.215540 14.292277 1.138663 9.149365
2013-01-06 11.337846 0.826159 -11.665085 1.037223
In [16]:
 df.sort_values(by='B')
Out[16]:
A B C D
2013-01-06 1.037223 -11.665085 0.826159 11.337846
2013-01-04 3.502094 -2.084429 -16.141706 -5.250545
2013-01-05 9.149365 1.138663 14.292277 7.215540
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358
2013-01-01 5.701238 6.855059 7.280012 -7.749124
2013-01-02 7.709387 15.030319 -12.744268 5.102852
In [17]:
df['A']
Out[17]:
2013-01-01    5.701238
2013-01-02    7.709387
2013-01-03   -1.801356
2013-01-04    3.502094
2013-01-05    9.149365
2013-01-06    1.037223
Freq: D, Name: A, dtype: float64
In [18]:
df[0:3]
Out[18]:
A B C D
2013-01-01 5.701238 6.855059 7.280012 -7.749124
2013-01-02 7.709387 15.030319 -12.744268 5.102852
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358
In [19]:
df['20130102':'20130104']
Out[19]:
A B C D
2013-01-02 7.709387 15.030319 -12.744268 5.102852
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358
2013-01-04 3.502094 -2.084429 -16.141706 -5.250545
In [20]:
#For getting a cross section using a label

df.loc[dates[0]]
Out[20]:
A    5.701238
B    6.855059
C    7.280012
D   -7.749124
Name: 2013-01-01 00:00:00, dtype: float64
In [21]:
df.loc[:,['A', 'B']]
Out[21]:
A B
2013-01-01 5.701238 6.855059
2013-01-02 7.709387 15.030319
2013-01-03 -1.801356 3.934794
2013-01-04 3.502094 -2.084429
2013-01-05 9.149365 1.138663
2013-01-06 1.037223 -11.665085
In [22]:
df.loc ['20130102':'20130104', ['A', 'B']]
Out[22]:
A B
2013-01-02 7.709387 15.030319
2013-01-03 -1.801356 3.934794
2013-01-04 3.502094 -2.084429
In [23]:
df.loc['20130102',['A','B']]
Out[23]:
A     7.709387
B    15.030319
Name: 2013-01-02 00:00:00, dtype: float64
In [24]:
df.loc[dates[0], 'A']
Out[24]:
5.7012378049110328
In [25]:
df.at[dates[0],'A']
Out[25]:
5.7012378049110328
In [26]:
#Select via the position of the passed integers

df.iloc[3]
Out[26]:
A     3.502094
B    -2.084429
C   -16.141706
D    -5.250545
Name: 2013-01-04 00:00:00, dtype: float64
In [27]:
df.iloc[3:5,0:2]
Out[27]:
A B
2013-01-04 3.502094 -2.084429
2013-01-05 9.149365 1.138663
In [28]:
df.iloc[[1,2,4],[0,2]]
Out[28]:
A C
2013-01-02 7.709387 -12.744268
2013-01-03 -1.801356 -5.572577
2013-01-05 9.149365 14.292277
In [29]:
df.iloc[1:3,:]
Out[29]:
A B C D
2013-01-02 7.709387 15.030319 -12.744268 5.102852
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358
In [30]:
df.iloc[1,1]
Out[30]:
15.030319261378509
In [31]:
df.iat[1,1]
Out[31]:
15.030319261378509
In [33]:
#Boolean Indexing

#Using a single column’s values to select data.

df[df.A > 5]
Out[33]:
A B C D
2013-01-01 5.701238 6.855059 7.280012 -7.749124
2013-01-02 7.709387 15.030319 -12.744268 5.102852
2013-01-05 9.149365 1.138663 14.292277 7.215540
In [34]:
# Selecting values from a DataFrame where a boolean condition is met.

df[df > 5]
Out[34]:
A B C D
2013-01-01 5.701238 6.855059 7.280012 NaN
2013-01-02 7.709387 15.030319 NaN 5.102852
2013-01-03 NaN NaN NaN NaN
2013-01-04 NaN NaN NaN NaN
2013-01-05 9.149365 NaN 14.292277 7.215540
2013-01-06 NaN NaN NaN 11.337846
In [36]:
# Using the isin() method for filtering:

df2 = df.copy()

df2['E'] = ['one', 'two', 'two', 'three', 'four', 'three']

df2
Out[36]:
A B C D E
2013-01-01 5.701238 6.855059 7.280012 -7.749124 one
2013-01-02 7.709387 15.030319 -12.744268 5.102852 two
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358 two
2013-01-04 3.502094 -2.084429 -16.141706 -5.250545 three
2013-01-05 9.149365 1.138663 14.292277 7.215540 four
2013-01-06 1.037223 -11.665085 0.826159 11.337846 three
In [37]:
df2[df2['E'].isin(['two','four'])]
Out[37]:
A B C D E
2013-01-02 7.709387 15.030319 -12.744268 5.102852 two
2013-01-03 -1.801356 3.934794 -5.572577 -3.970358 two
2013-01-05 9.149365 1.138663 14.292277 7.215540 four
In [38]:
#Setting a new column automatically aligns the data by the indexes

s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range('20130102', periods= 6))

s1
Out[38]:
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
In [39]:
df['F'] = s1
In [40]:
# Setting values by label

df.at[dates[0],'A'] = 0
In [41]:
# Setting values by position

df.iat[0,1] =0
In [42]:
# Setting by assigning with a numpy array
df.loc[:,'D'] = np.array ([5] * len(df))
In [43]:
df
Out[43]:
A B C D F
2013-01-01 0.000000 0.000000 7.280012 5 NaN
2013-01-02 7.709387 15.030319 -12.744268 5 1.0
2013-01-03 -1.801356 3.934794 -5.572577 5 2.0
2013-01-04 3.502094 -2.084429 -16.141706 5 3.0
2013-01-05 9.149365 1.138663 14.292277 5 4.0
2013-01-06 1.037223 -11.665085 0.826159 5 5.0
In [44]:
# A where operation with setting

df2 = df.copy()

df2[df2 > 0]= -df2

df2
Out[44]:
A B C D F
2013-01-01 0.000000 0.000000 -7.280012 -5 NaN
2013-01-02 -7.709387 -15.030319 -12.744268 -5 -1.0
2013-01-03 -1.801356 -3.934794 -5.572577 -5 -2.0
2013-01-04 -3.502094 -2.084429 -16.141706 -5 -3.0
2013-01-05 -9.149365 -1.138663 -14.292277 -5 -4.0
2013-01-06 -1.037223 -11.665085 -0.826159 -5 -5.0
In [46]:
# Reindexing allows you to change/add/delete the index on a specified axis.
# This returns a copy of the data.

df1 = df.reindex(index=dates[0:4], columns = list(df.columns) + ['E'])

df1.loc[dates[0]:dates[2], 'E'] = 1

df1
Out[46]:
A B C D F E
2013-01-01 0.000000 0.000000 7.280012 5 NaN 1.0
2013-01-02 7.709387 15.030319 -12.744268 5 1.0 1.0
2013-01-03 -1.801356 3.934794 -5.572577 5 2.0 1.0
2013-01-04 3.502094 -2.084429 -16.141706 5 3.0 NaN
In [47]:
# To drop any rows that have missing data.

df1.dropna(how='any')
Out[47]:
A B C D F E
2013-01-02 7.709387 15.030319 -12.744268 5 1.0 1.0
2013-01-03 -1.801356 3.934794 -5.572577 5 2.0 1.0
In [48]:
# Filling missing data

df1.fillna(value=6)
Out[48]:
A B C D F E
2013-01-01 0.000000 0.000000 7.280012 5 6.0 1.0
2013-01-02 7.709387 15.030319 -12.744268 5 1.0 1.0
2013-01-03 -1.801356 3.934794 -5.572577 5 2.0 1.0
2013-01-04 3.502094 -2.084429 -16.141706 5 3.0 6.0
In [50]:
# To get the boolean mask where values are nan

pd.isnull(df1)
Out[50]:
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False False
2013-01-04 False False False False False True
In [51]:
df.mean ()
Out[51]:
A    3.266119
B    1.059044
C   -2.010017
D    5.000000
F    3.000000
dtype: float64
In [52]:
df.mean(1)
Out[52]:
2013-01-01    3.070003
2013-01-02    3.199088
2013-01-03    0.712172
2013-01-04   -1.344808
2013-01-05    6.716061
2013-01-06    0.039659
Freq: D, dtype: float64
In [56]:
# Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.

s = pd.Series([1,3,5,np.nan,6,8], index= dates).shift(1)

s
Out[56]:
2013-01-01    NaN
2013-01-02    1.0
2013-01-03    3.0
2013-01-04    5.0
2013-01-05    NaN
2013-01-06    6.0
Freq: D, dtype: float64
In [57]:
df.sub(s,axis='index')
Out[57]:
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 6.709387 14.030319 -13.744268 4.0 0.0
2013-01-03 -4.801356 0.934794 -8.572577 2.0 -1.0
2013-01-04 -1.497906 -7.084429 -21.141706 0.0 -2.0
2013-01-05 NaN NaN NaN NaN NaN
2013-01-06 -4.962777 -17.665085 -5.173841 -1.0 -1.0
In [58]:
#Applying functions to the data

df.apply(np.cumsum)
Out[58]:
A B C D F
2013-01-01 0.000000 0.000000 7.280012 5 NaN
2013-01-02 7.709387 15.030319 -5.464256 10 1.0
2013-01-03 5.908030 18.965113 -11.036833 15 3.0
2013-01-04 9.410124 16.880684 -27.178539 20 6.0
2013-01-05 18.559489 18.019346 -12.886262 25 10.0
2013-01-06 19.596712 6.354261 -12.060103 30 15.0
In [59]:
df.apply(lambda x: x.max()- x.min())
Out[59]:
A    10.950721
B    26.695405
C    30.433983
D     0.000000
F     4.000000
dtype: float64
In [61]:
#Histogramming

s = pd.Series(np.random.randint(0, 7, size=10))
s
Out[61]:
0    0
1    2
2    6
3    2
4    6
5    4
6    0
7    4
8    5
9    1
dtype: int32
In [62]:
s.value_counts()
Out[62]:
6    2
4    2
2    2
0    2
5    1
1    1
dtype: int64
In [63]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,'CABA','dog','cat'])

s.str.lower()
Out[63]:
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
In [64]:
df = pd.DataFrame(np.random.randn(10,4))

df
Out[64]:
0 1 2 3
0 0.408694 0.075593 1.092644 0.067992
1 1.242592 -0.613958 1.302029 0.398823
2 0.571330 -1.211505 -1.642547 -0.963620
3 0.046183 -1.009694 0.987836 -1.104112
4 1.852863 -0.881583 -0.059117 1.300894
5 1.335388 -1.059745 -0.370339 1.793200
6 -0.470991 0.022208 -1.045180 -0.001308
7 0.469683 0.462645 0.291096 -1.369258
8 0.163337 2.233351 1.482339 0.027145
9 -0.114106 0.900924 -0.278806 -1.035798
In [65]:
df[:3]
Out[65]:
0 1 2 3
0 0.408694 0.075593 1.092644 0.067992
1 1.242592 -0.613958 1.302029 0.398823
2 0.571330 -1.211505 -1.642547 -0.963620
In [66]:
df[3:7]
Out[66]:
0 1 2 3
3 0.046183 -1.009694 0.987836 -1.104112
4 1.852863 -0.881583 -0.059117 1.300894
5 1.335388 -1.059745 -0.370339 1.793200
6 -0.470991 0.022208 -1.045180 -0.001308
In [67]:
df[7:]
Out[67]:
0 1 2 3
7 0.469683 0.462645 0.291096 -1.369258
8 0.163337 2.233351 1.482339 0.027145
9 -0.114106 0.900924 -0.278806 -1.035798
In [69]:
# break it into pieces

pieces = [df[:3], df[3:7], df[7:]]

pd.concat(pieces)
Out[69]:
0 1 2 3
0 0.408694 0.075593 1.092644 0.067992
1 1.242592 -0.613958 1.302029 0.398823
2 0.571330 -1.211505 -1.642547 -0.963620
3 0.046183 -1.009694 0.987836 -1.104112
4 1.852863 -0.881583 -0.059117 1.300894
5 1.335388 -1.059745 -0.370339 1.793200
6 -0.470991 0.022208 -1.045180 -0.001308
7 0.469683 0.462645 0.291096 -1.369258
8 0.163337 2.233351 1.482339 0.027145
9 -0.114106 0.900924 -0.278806 -1.035798
In [70]:
 #Database style joining
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

left
Out[70]:
key lval
0 foo 1
1 foo 2
In [71]:
right
Out[71]:
key rval
0 foo 4
1 foo 5
In [72]:
pd.merge(left, right, on='key')
Out[72]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
In [73]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

left
Out[73]:
key lval
0 foo 1
1 bar 2
In [74]:
right
Out[74]:
key rval
0 foo 4
1 bar 5
In [75]:
pd.merge(left, right, on='key')
Out[75]:
key lval rval
0 foo 1 4
1 bar 2 5
In [81]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])

df
Out[81]:
A B C D
0 0.064288 1.073855 1.796381 -0.224552
1 0.352561 -0.506424 -0.718441 -0.257718
2 -1.137121 1.874188 -0.635592 0.281018
3 -0.736062 0.597500 0.333268 -0.172509
4 -1.309330 1.177244 0.967628 -1.198871
5 -0.376439 -0.805552 1.237947 0.720198
6 1.073608 0.473752 -0.398873 -0.825282
7 1.299102 0.704625 0.284264 -2.502382
In [82]:
s = df.iloc[3]

s
Out[82]:
A   -0.736062
B    0.597500
C    0.333268
D   -0.172509
Name: 3, dtype: float64
In [83]:
df.append(s, ignore_index=True)
Out[83]:
A B C D
0 0.064288 1.073855 1.796381 -0.224552
1 0.352561 -0.506424 -0.718441 -0.257718
2 -1.137121 1.874188 -0.635592 0.281018
3 -0.736062 0.597500 0.333268 -0.172509
4 -1.309330 1.177244 0.967628 -1.198871
5 -0.376439 -0.805552 1.237947 0.720198
6 1.073608 0.473752 -0.398873 -0.825282
7 1.299102 0.704625 0.284264 -2.502382
8 -0.736062 0.597500 0.333268 -0.172509
In [85]:
# By “group by” we are referring to a process involving one or more of the following steps
#Splitting the data into groups based on some criteria
#Applying a function to each group independently
#Combining the results into a data structure


df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

df
Out[85]:
A B C D
0 foo one 0.265532 1.332233
1 bar one 0.479317 0.608824
2 foo two -1.106388 -0.257345
3 bar three -1.022975 -0.526723
4 foo two 0.300154 1.705182
5 bar two -1.526280 -0.828928
6 foo one 0.236994 -1.676835
7 foo three -0.054521 -1.251884
In [87]:
df.groupby('B').sum()
Out[87]:
C D
B
one 0.981842 0.264222
three -1.077496 -1.778607
two -2.332514 0.618910
In [88]:
df.groupby(['A','B']).sum()
Out[88]:
C D
A B
bar one 0.479317 0.608824
three -1.022975 -0.526723
two -1.526280 -0.828928
foo one 0.502526 -0.344602
three -0.054521 -1.251884
two -0.806234 1.447837
In [90]:
# Hierarchical indexing and reshaping.
# zip(*[...]) pairs the two parallel label lists into
# [('bar', 'one'), ('bar', 'two'), ...] tuples.

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                   'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                    'one', 'two', 'one', 'two']]))

# A MultiIndex gives each row two labels, named 'first' and 'second'.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

# Keep only the first four rows for the stack()/unstack() examples below.
df2 = df[:4]

df2
Out[90]:
A B
first second
bar one 1.646276 -0.634797
two 0.589237 -0.464283
baz one 0.118206 1.219647
two 0.346672 -0.430455
In [91]:
# The stack() method “compresses” a level in the DataFrame’s columns.
stacked = df2.stack()

stacked
Out[91]:
first  second   
bar    one     A    1.646276
               B   -0.634797
       two     A    0.589237
               B   -0.464283
baz    one     A    0.118206
               B    1.219647
       two     A    0.346672
               B   -0.430455
dtype: float64
In [92]:
# Inverse operation of stack()

stacked.unstack()
Out[92]:
A B
first second
bar one 1.646276 -0.634797
two 0.589237 -0.464283
baz one 0.118206 1.219647
two 0.346672 -0.430455
In [93]:
stacked.unstack(1)
Out[93]:
second one two
first
bar A 1.646276 0.589237
B -0.634797 -0.464283
baz A 0.118206 0.346672
B 1.219647 -0.430455
In [94]:
stacked.unstack(0)
Out[94]:
first bar baz
second
one A 1.646276 0.118206
B -0.634797 1.219647
two A 0.589237 0.346672
B -0.464283 -0.430455
In [98]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})

df
Out[98]:
A B C D E
0 one A foo -1.982610 0.448172
1 one B foo 0.062427 -0.744829
2 two C foo -0.745716 -0.313134
3 three A bar -0.835508 -1.152225
4 one B bar -0.859922 0.549435
5 one C bar 0.830919 -1.050315
6 two A foo 0.267296 -0.252132
7 three B foo 0.449456 1.477294
8 one C foo 0.123385 0.860408
9 one A bar 0.889326 0.223538
10 two B bar 0.486930 0.674512
11 three C bar -0.471057 -1.595881
In [99]:
pd.pivot_table(df, values='E', index=['A', 'B'], columns=['C'])
Out[99]:
C bar foo
A B
one A 0.223538 0.448172
B 0.549435 -0.744829
C -1.050315 0.860408
three A -1.152225 NaN
B NaN 1.477294
C -1.595881 NaN
two A NaN -0.252132
B 0.674512 NaN
C NaN -0.313134
In [100]:
#Time Series

# 100 timestamps at one-second frequency starting 2012-01-01
# (newer pandas prefers the lower-case alias 's' over 'S').
rng = pd.date_range('1/1/2012', periods=100, freq='S')

# Random integer samples in [0, 500) indexed by those timestamps.
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

# Downsample the 1-second data into 5-minute bins, summing each bin
# (all 100 seconds fall into a single bin here).
ts.resample('5Min').sum()
Out[100]:
2012-01-01    24717
Freq: 5T, dtype: int32
In [101]:
#Timezone Representation

rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')

ts = pd.Series(np.random.randn(len(rng)), rng)

ts 
Out[101]:
2012-03-06   -0.328047
2012-03-07   -2.065020
2012-03-08   -0.602003
2012-03-09    2.039196
2012-03-10   -0.498256
Freq: D, dtype: float64
In [102]:
ts_utc = ts.tz_localize('UTC')

ts_utc
Out[102]:
2012-03-06 00:00:00+00:00   -0.328047
2012-03-07 00:00:00+00:00   -2.065020
2012-03-08 00:00:00+00:00   -0.602003
2012-03-09 00:00:00+00:00    2.039196
2012-03-10 00:00:00+00:00   -0.498256
Freq: D, dtype: float64
In [103]:
# Converting between time span representations

rng = pd.date_range('1/1/2012', periods=5, freq='M')

ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts
Out[103]:
2012-01-31   -0.833088
2012-02-29    0.198797
2012-03-31    1.426191
2012-04-30    1.431073
2012-05-31    1.357619
Freq: M, dtype: float64
In [104]:
ps = ts.to_period()


ps
Out[104]:
2012-01   -0.833088
2012-02    0.198797
2012-03    1.426191
2012-04    1.431073
2012-05    1.357619
Freq: M, dtype: float64
In [105]:
ps.to_timestamp()
Out[105]:
2012-01-01   -0.833088
2012-02-01    0.198797
2012-03-01    1.426191
2012-04-01    1.431073
2012-05-01    1.357619
Freq: MS, dtype: float64
In [106]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')

ts = pd.Series(np.random.randn(len(prng)), prng)

ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9

ts.head()
Out[106]:
1990-03-01 09:00   -0.354049
1990-06-01 09:00    0.230064
1990-09-01 09:00   -0.561966
1990-12-01 09:00    0.132596
1991-03-01 09:00    0.572927
Freq: H, dtype: float64
In [107]:
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
In [108]:
df["grade"] = df["raw_grade"].astype("category")

df["grade"]
Out[108]:
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
In [109]:
df["grade"].cat.categories = ["very good", "good", "very bad"]
In [110]:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

df["grade"]
Out[110]:
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
In [111]:
df.sort_values(by="grade")
Out[111]:
id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good
In [112]:
df.groupby("grade").size()
Out[112]:
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64
In [116]:
# Plotting

# 1000 standard-normal draws indexed by consecutive days from 2000-01-01.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

# Cumulative sum turns the i.i.d. noise into a random walk.
# NOTE(review): this overwrites ts, so re-running only this cell keeps
# accumulating -- re-run from the cell above for a fresh walk.
ts = ts.cumsum()

ts.plot()
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x21a92ced128>
In [117]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

plt.figure(); df.plot(); plt.legend(loc='best')
Out[117]:
<matplotlib.legend.Legend at 0x21a92e9b550>
In [118]:
df.to_csv('foo.csv')
In [119]:
 pd.read_csv('foo.csv')
Out[119]:
Unnamed: 0 A B C D
0 2000-01-01 -1.960527 -1.285968 -0.770272 1.732647
1 2000-01-02 -1.869375 -1.599188 -0.335176 1.961362
2 2000-01-03 -2.208295 -3.530446 0.266622 1.353577
3 2000-01-04 -1.727556 -3.565926 1.374389 1.917739
4 2000-01-05 -0.374437 -3.784898 -0.094169 2.786670
5 2000-01-06 -2.165799 -2.597103 -1.204030 2.745869
6 2000-01-07 -2.075078 -1.667786 -1.015625 3.632785
7 2000-01-08 -3.332855 -1.343136 -0.714277 5.110379
8 2000-01-09 -4.770663 -2.718791 2.343364 4.759776
9 2000-01-10 -5.007911 -3.119180 4.162103 7.178859
10 2000-01-11 -6.099561 -2.037347 4.652177 6.461335
11 2000-01-12 -4.538086 -1.593626 4.678336 5.706831
12 2000-01-13 -4.767281 -3.213769 5.378705 4.593024
13 2000-01-14 -6.067503 -2.167420 6.422238 4.398896
14 2000-01-15 -4.447423 -0.804950 8.166062 4.874810
15 2000-01-16 -4.034009 -0.723190 8.249881 4.416808
16 2000-01-17 -2.237815 -2.231110 6.861279 6.396465
17 2000-01-18 -2.750334 -2.457918 7.349380 6.311472
18 2000-01-19 -3.047330 -1.558872 7.433276 7.746978
19 2000-01-20 -1.490948 -1.525291 7.869527 8.170367
20 2000-01-21 -1.734994 -2.125683 7.048601 9.737116
21 2000-01-22 -1.710590 -3.667013 7.470567 9.537319
22 2000-01-23 -1.563084 -3.666055 7.867744 9.662418
23 2000-01-24 -2.484694 -3.232842 8.269260 9.339057
24 2000-01-25 -3.578921 -3.049872 7.872652 8.390124
25 2000-01-26 -4.244715 -2.754831 7.898890 8.161921
26 2000-01-27 -5.494957 -1.986982 7.331185 6.513367
27 2000-01-28 -5.528515 -2.813747 7.121798 7.129052
28 2000-01-29 -5.073192 -1.825633 6.207346 8.442592
29 2000-01-30 -7.780544 -1.076250 6.377382 7.883617
... ... ... ... ... ...
970 2002-08-28 -44.017723 -4.404109 16.899174 -3.480199
971 2002-08-29 -43.114688 -4.383443 17.528599 -1.689035
972 2002-08-30 -41.922337 -4.542466 16.670459 -2.249377
973 2002-08-31 -42.385755 -4.517940 18.834638 -3.075029
974 2002-09-01 -43.861665 -3.233773 19.172976 -3.295808
975 2002-09-02 -42.855793 -3.126281 17.267810 -1.674596
976 2002-09-03 -42.012861 -3.019981 17.629445 -3.778466
977 2002-09-04 -41.173032 -4.188818 17.295629 -4.378031
978 2002-09-05 -40.541550 -4.241131 17.236057 -3.671891
979 2002-09-06 -41.425338 -4.205773 16.088178 -4.459475
980 2002-09-07 -41.521988 -6.307009 15.527701 -5.666109
981 2002-09-08 -41.677102 -5.661782 14.303965 -4.337424
982 2002-09-09 -40.586610 -5.694311 15.961002 -3.889111
983 2002-09-10 -39.825491 -6.402531 15.006266 -2.606617
984 2002-09-11 -39.913708 -5.881342 14.111499 -3.717421
985 2002-09-12 -39.139080 -7.662673 13.700208 -4.313833
986 2002-09-13 -38.767603 -8.796602 13.802048 -4.249081
987 2002-09-14 -38.149622 -7.482174 15.204716 -4.286389
988 2002-09-15 -37.132710 -7.029820 16.056653 -3.609412
989 2002-09-16 -35.220422 -6.245927 17.128572 -2.957648
990 2002-09-17 -36.093959 -6.413961 18.150520 -2.637952
991 2002-09-18 -36.402336 -5.096145 18.317224 -1.837115
992 2002-09-19 -33.749346 -5.860567 18.575125 -0.991736
993 2002-09-20 -32.394332 -5.664858 20.529412 -3.751172
994 2002-09-21 -34.574008 -4.575883 20.004798 -4.152625
995 2002-09-22 -36.980502 -5.331763 20.361332 -3.797302
996 2002-09-23 -37.399011 -5.330728 21.173518 -3.465505
997 2002-09-24 -37.767723 -3.607712 21.651700 -3.331427
998 2002-09-25 -36.151342 -2.315392 22.765483 -2.369131
999 2002-09-26 -38.292146 -1.700859 22.142733 -3.808816

1000 rows × 5 columns

In [120]:
df.to_hdf('foo.h5','df')
In [121]:
pd.read_hdf('foo.h5','df')
Out[121]:
A B C D
2000-01-01 -1.960527 -1.285968 -0.770272 1.732647
2000-01-02 -1.869375 -1.599188 -0.335176 1.961362
2000-01-03 -2.208295 -3.530446 0.266622 1.353577
2000-01-04 -1.727556 -3.565926 1.374389 1.917739
2000-01-05 -0.374437 -3.784898 -0.094169 2.786670
2000-01-06 -2.165799 -2.597103 -1.204030 2.745869
2000-01-07 -2.075078 -1.667786 -1.015625 3.632785
2000-01-08 -3.332855 -1.343136 -0.714277 5.110379
2000-01-09 -4.770663 -2.718791 2.343364 4.759776
2000-01-10 -5.007911 -3.119180 4.162103 7.178859
2000-01-11 -6.099561 -2.037347 4.652177 6.461335
2000-01-12 -4.538086 -1.593626 4.678336 5.706831
2000-01-13 -4.767281 -3.213769 5.378705 4.593024
2000-01-14 -6.067503 -2.167420 6.422238 4.398896
2000-01-15 -4.447423 -0.804950 8.166062 4.874810
2000-01-16 -4.034009 -0.723190 8.249881 4.416808
2000-01-17 -2.237815 -2.231110 6.861279 6.396465
2000-01-18 -2.750334 -2.457918 7.349380 6.311472
2000-01-19 -3.047330 -1.558872 7.433276 7.746978
2000-01-20 -1.490948 -1.525291 7.869527 8.170367
2000-01-21 -1.734994 -2.125683 7.048601 9.737116
2000-01-22 -1.710590 -3.667013 7.470567 9.537319
2000-01-23 -1.563084 -3.666055 7.867744 9.662418
2000-01-24 -2.484694 -3.232842 8.269260 9.339057
2000-01-25 -3.578921 -3.049872 7.872652 8.390124
2000-01-26 -4.244715 -2.754831 7.898890 8.161921
2000-01-27 -5.494957 -1.986982 7.331185 6.513367
2000-01-28 -5.528515 -2.813747 7.121798 7.129052
2000-01-29 -5.073192 -1.825633 6.207346 8.442592
2000-01-30 -7.780544 -1.076250 6.377382 7.883617
... ... ... ... ...
2002-08-28 -44.017723 -4.404109 16.899174 -3.480199
2002-08-29 -43.114688 -4.383443 17.528599 -1.689035
2002-08-30 -41.922337 -4.542466 16.670459 -2.249377
2002-08-31 -42.385755 -4.517940 18.834638 -3.075029
2002-09-01 -43.861665 -3.233773 19.172976 -3.295808
2002-09-02 -42.855793 -3.126281 17.267810 -1.674596
2002-09-03 -42.012861 -3.019981 17.629445 -3.778466
2002-09-04 -41.173032 -4.188818 17.295629 -4.378031
2002-09-05 -40.541550 -4.241131 17.236057 -3.671891
2002-09-06 -41.425338 -4.205773 16.088178 -4.459475
2002-09-07 -41.521988 -6.307009 15.527701 -5.666109
2002-09-08 -41.677102 -5.661782 14.303965 -4.337424
2002-09-09 -40.586610 -5.694311 15.961002 -3.889111
2002-09-10 -39.825491 -6.402531 15.006266 -2.606617
2002-09-11 -39.913708 -5.881342 14.111499 -3.717421
2002-09-12 -39.139080 -7.662673 13.700208 -4.313833
2002-09-13 -38.767603 -8.796602 13.802048 -4.249081
2002-09-14 -38.149622 -7.482174 15.204716 -4.286389
2002-09-15 -37.132710 -7.029820 16.056653 -3.609412
2002-09-16 -35.220422 -6.245927 17.128572 -2.957648
2002-09-17 -36.093959 -6.413961 18.150520 -2.637952
2002-09-18 -36.402336 -5.096145 18.317224 -1.837115
2002-09-19 -33.749346 -5.860567 18.575125 -0.991736
2002-09-20 -32.394332 -5.664858 20.529412 -3.751172
2002-09-21 -34.574008 -4.575883 20.004798 -4.152625
2002-09-22 -36.980502 -5.331763 20.361332 -3.797302
2002-09-23 -37.399011 -5.330728 21.173518 -3.465505
2002-09-24 -37.767723 -3.607712 21.651700 -3.331427
2002-09-25 -36.151342 -2.315392 22.765483 -2.369131
2002-09-26 -38.292146 -1.700859 22.142733 -3.808816

1000 rows × 4 columns

In [122]:
df.to_excel('foo.xlsx', sheet_name='Sheet1')
In [123]:
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
Out[123]:
A B C D
2000-01-01 -1.960527 -1.285968 -0.770272 1.732647
2000-01-02 -1.869375 -1.599188 -0.335176 1.961362
2000-01-03 -2.208295 -3.530446 0.266622 1.353577
2000-01-04 -1.727556 -3.565926 1.374389 1.917739
2000-01-05 -0.374437 -3.784898 -0.094169 2.786670
2000-01-06 -2.165799 -2.597103 -1.204030 2.745869
2000-01-07 -2.075078 -1.667786 -1.015625 3.632785
2000-01-08 -3.332855 -1.343136 -0.714277 5.110379
2000-01-09 -4.770663 -2.718791 2.343364 4.759776
2000-01-10 -5.007911 -3.119180 4.162103 7.178859
2000-01-11 -6.099561 -2.037347 4.652177 6.461335
2000-01-12 -4.538086 -1.593626 4.678336 5.706831
2000-01-13 -4.767281 -3.213769 5.378705 4.593024
2000-01-14 -6.067503 -2.167420 6.422238 4.398896
2000-01-15 -4.447423 -0.804950 8.166062 4.874810
2000-01-16 -4.034009 -0.723190 8.249881 4.416808
2000-01-17 -2.237815 -2.231110 6.861279 6.396465
2000-01-18 -2.750334 -2.457918 7.349380 6.311472
2000-01-19 -3.047330 -1.558872 7.433276 7.746978
2000-01-20 -1.490948 -1.525291 7.869527 8.170367
2000-01-21 -1.734994 -2.125683 7.048601 9.737116
2000-01-22 -1.710590 -3.667013 7.470567 9.537319
2000-01-23 -1.563084 -3.666055 7.867744 9.662418
2000-01-24 -2.484694 -3.232842 8.269260 9.339057
2000-01-25 -3.578921 -3.049872 7.872652 8.390124
2000-01-26 -4.244715 -2.754831 7.898890 8.161921
2000-01-27 -5.494957 -1.986982 7.331185 6.513367
2000-01-28 -5.528515 -2.813747 7.121798 7.129052
2000-01-29 -5.073192 -1.825633 6.207346 8.442592
2000-01-30 -7.780544 -1.076250 6.377382 7.883617
... ... ... ... ...
2002-08-28 -44.017723 -4.404109 16.899174 -3.480199
2002-08-29 -43.114688 -4.383443 17.528599 -1.689035
2002-08-30 -41.922337 -4.542466 16.670459 -2.249377
2002-08-31 -42.385755 -4.517940 18.834638 -3.075029
2002-09-01 -43.861665 -3.233773 19.172976 -3.295808
2002-09-02 -42.855793 -3.126281 17.267810 -1.674596
2002-09-03 -42.012861 -3.019981 17.629445 -3.778466
2002-09-04 -41.173032 -4.188818 17.295629 -4.378031
2002-09-05 -40.541550 -4.241131 17.236057 -3.671891
2002-09-06 -41.425338 -4.205773 16.088178 -4.459475
2002-09-07 -41.521988 -6.307009 15.527701 -5.666109
2002-09-08 -41.677102 -5.661782 14.303965 -4.337424
2002-09-09 -40.586610 -5.694311 15.961002 -3.889111
2002-09-10 -39.825491 -6.402531 15.006266 -2.606617
2002-09-11 -39.913708 -5.881342 14.111499 -3.717421
2002-09-12 -39.139080 -7.662673 13.700208 -4.313833
2002-09-13 -38.767603 -8.796602 13.802048 -4.249081
2002-09-14 -38.149622 -7.482174 15.204716 -4.286389
2002-09-15 -37.132710 -7.029820 16.056653 -3.609412
2002-09-16 -35.220422 -6.245927 17.128572 -2.957648
2002-09-17 -36.093959 -6.413961 18.150520 -2.637952
2002-09-18 -36.402336 -5.096145 18.317224 -1.837115
2002-09-19 -33.749346 -5.860567 18.575125 -0.991736
2002-09-20 -32.394332 -5.664858 20.529412 -3.751172
2002-09-21 -34.574008 -4.575883 20.004798 -4.152625
2002-09-22 -36.980502 -5.331763 20.361332 -3.797302
2002-09-23 -37.399011 -5.330728 21.173518 -3.465505
2002-09-24 -37.767723 -3.607712 21.651700 -3.331427
2002-09-25 -36.151342 -2.315392 22.765483 -2.369131
2002-09-26 -38.292146 -1.700859 22.142733 -3.808816

1000 rows × 4 columns

Comments

Popular posts from this blog

Exercise 2 - 3

Matplotlib Library