Pandas Library
In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# A Series is a one-dimensional labelled array (it also backs time series).
# The np.nan entry marks a missing value, so the values are stored as floats.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s
Out[2]:
In [3]:
# DataFrame: the primary pandas structure. A two-dimensional, size-mutable,
# potentially heterogeneous table with labelled rows and columns; it can be
# thought of as a dict-like container of Series. Arithmetic aligns on both
# row and column labels.
# Six consecutive days serve as the row index. (A bare `dates` expression in
# the middle of a cell is never rendered, so it is not repeated here.)
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(
    10 * np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df
Out[3]:
In [4]:
# A dict of column specs builds the frame: the scalar entries ('A', 'B', 'F')
# are broadcast to the length of the array-like ones ('C', 'D', 'E').
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})
df2
Out[4]:
In [5]:
# Each column of a DataFrame can have its own dtype.
df2.dtypes
Out[5]:
In [6]:
#Subset of the attributes
# In IPython/Jupyter, typing `df2.` and pressing <TAB> lists the available
# attributes (including one per column). `df2.<TAB>` itself is not valid
# Python, so show a slice of the public names programmatically instead.
[name for name in dir(df2) if not name.startswith('_')][:20]
In [8]:
# Show the first rows of the frame (head() defaults to five rows).
df.head()
Out[8]:
In [9]:
# Show the last three rows of the frame.
df.tail(3)
Out[9]:
In [10]:
# Row labels of the frame.
df.index
Out[10]:
In [11]:
# Column labels of the frame.
df.columns
Out[11]:
In [12]:
# The underlying data as a NumPy array, without the index/column labels.
df.values
Out[12]:
In [13]:
# describe() gives a quick statistical summary of each numeric column
# (count, mean, std, min, quartiles, max).
df.describe()
Out[13]:
In [14]:
# Transposing your data: rows become columns and columns become rows.
df.T
Out[14]:
In [15]:
# Sorting by an axis: axis=1 orders the columns by their labels, here in
# descending order.
df.sort_index(axis=1, ascending=False)
Out[15]:
In [16]:
# Sort the rows by the values in column 'B' (ascending by default).
df.sort_values(by='B')
Out[16]:
In [17]:
# Selecting a single column yields a Series.
df['A']
Out[17]:
In [18]:
# A plain slice inside [] selects rows by position: here the first three.
df[0:3]
Out[18]:
In [19]:
# String slices on a DatetimeIndex select rows by label; both endpoints
# are included.
df['20130102':'20130104']
Out[19]:
In [20]:
# For getting a cross section using a label: the row labelled dates[0],
# returned as a Series keyed by column name.
df.loc[dates[0]]
Out[20]:
In [21]:
# Select on multiple axes by label: every row, columns 'A' and 'B'.
df.loc[:,['A', 'B']]
Out[21]:
In [22]:
# Label slicing with .loc includes both endpoints of the slice.
df.loc ['20130102':'20130104', ['A', 'B']]
Out[22]:
In [23]:
# A single row label plus a list of columns reduces the result to a Series.
df.loc['20130102',['A','B']]
Out[23]:
In [24]:
# Scalar lookup by row label and column label.
df.loc[dates[0], 'A']
Out[24]:
In [25]:
# .at performs fast label-based access to a single scalar.
df.at[dates[0],'A']
Out[25]:
In [26]:
# Select via the position of the passed integers: position 3 is the fourth
# row, returned as a Series.
df.iloc[3]
Out[26]:
In [27]:
# Integer slices act like NumPy/Python slices (end excluded): rows 3-4,
# columns 0-1.
df.iloc[3:5,0:2]
Out[27]:
In [28]:
# Lists of integer positions pick rows 1, 2, 4 and columns 0, 2.
df.iloc[[1,2,4],[0,2]]
Out[28]:
In [29]:
# Slice rows explicitly (positions 1 and 2) while keeping every column.
df.iloc[1:3,:]
Out[29]:
In [30]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]
Out[30]:
In [31]:
# .iat performs fast position-based access to a single scalar.
df.iat[1,1]
Out[31]:
In [33]:
#Boolean Indexing
# Keep only the rows whose value in column 'A' is greater than 5.
df[df['A'] > 5]
Out[33]:
In [34]:
# Selecting values from a DataFrame where a boolean condition is met.
# The shape is kept; entries that fail the condition are shown as NaN.
df[df > 5]
Out[34]:
In [36]:
# Using the isin() method for filtering: first build a copy of df with an
# extra label column 'E' to filter on (df itself is left unchanged).
df2 = df.assign(E=['one', 'two', 'two', 'three', 'four', 'three'])
df2
Out[36]:
In [37]:
# Keep the rows whose 'E' value is one of the listed labels.
df2[df2['E'].isin(['two','four'])]
Out[37]:
In [38]:
# Setting a new column automatically aligns the data by the indexes.
# This Series starts one day later (2013-01-02) than a range that begins on
# 2013-01-01, which makes the alignment visible when it is assigned.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20130102', periods=6),
)
s1
Out[38]:
In [39]:
# Assigning a Series aligns it on df's index: rows whose label is missing
# from the Series get NaN, and Series labels absent from df are dropped.
df['F'] = s1
In [40]:
# Setting values by label: .at writes the single cell addressed by a row
# label and a column label.
df.at[dates[0],'A'] = 0
In [41]:
# Setting values by position: .iat writes the cell at row 0, column 1.
df.iat[0,1] =0
In [42]:
# Setting by assigning with a numpy array: the array needs one entry per
# row, hence len(df).
df.loc[:,'D'] = np.array ([5] * len(df))
In [43]:
# Display the frame to inspect the result of the preceding assignments.
df
Out[43]:
In [44]:
# A where operation with setting: work on a copy so df stays untouched,
# then replace every value greater than 0 with its negation. NaN entries
# are left as they are because NaN > 0 is False.
df2 = df.copy()
df2[df2 > 0]= -df2
df2
Out[44]:
In [46]:
# Reindexing changes/adds/deletes labels along an axis and returns a copy.
# Keep the first four dates and append a new, initially empty column 'E'.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
# Label slices are inclusive, so this fills 'E' for dates[0] through dates[2].
df1.loc[dates[0]:dates[2], 'E'] = 1
df1
Out[46]:
In [47]:
# To drop any rows that have missing data. A new frame is returned; df1
# itself is not modified.
df1.dropna(how='any')
Out[47]:
In [48]:
# Filling missing data: every NaN is replaced by 6 in the returned frame;
# df1 itself is not modified.
df1.fillna(value=6)
Out[48]:
In [50]:
# Boolean mask with the same shape as df1: True wherever a value is NaN.
df1.isna()
Out[50]:
In [51]:
# Mean of each column (axis 0 is the default); missing values are skipped.
df.mean ()
Out[51]:
In [52]:
# Same operation on the other axis: the mean of each row.
df.mean(axis=1)
Out[52]:
In [56]:
# Operating with objects that have different dimensionality and need
# alignment; pandas automatically broadcasts along the specified dimension.
# shift moves the values down by one row, leaving NaN in the first position.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=dates).shift(periods=1)
s
Out[56]:
In [57]:
# Subtract s from every column, matching on the row index; rows where s is
# NaN produce NaN.
df.sub(s,axis='index')
Out[57]:
In [58]:
# Applying functions to the data: apply() calls the function on each column
# (axis 0 by default), giving a running sum down every column.
df.apply(np.cumsum)
Out[58]:
In [59]:
# Range (max - min) of each column.
df.apply(lambda x: x.max()- x.min())
Out[59]:
In [61]:
# Histogramming
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s
Out[61]:
In [62]:
# Count how often each distinct value occurs, most frequent first.
s.value_counts()
Out[62]:
In [63]:
# String methods live under the .str accessor and operate element-wise;
# the missing value (np.nan) stays missing in the result.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)
s.str.lower()
Out[63]:
In [64]:
# A 10 x 4 frame of standard-normal values with default integer labels,
# used as the input for the concat example.
df = pd.DataFrame(data=np.random.randn(10, 4))
df
Out[64]:
In [65]:
# The first three rows.
df[:3]
Out[65]:
In [66]:
# Rows at positions 3 through 6.
df[3:7]
Out[66]:
In [67]:
# Rows from position 7 to the end.
df[7:]
Out[67]:
In [69]:
# Split the frame into three consecutive row blocks, then stack them back
# together; concat joins along the row axis by default.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(pieces)
Out[69]:
In [70]:
# Database-style joining
# Two small frames that share the column 'key'; the key 'foo' appears twice
# on each side.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})
left
Out[70]:
In [71]:
# Display the right-hand frame of the join.
right
Out[71]:
In [72]:
# Join the two frames on their shared 'key' column (inner join by default).
left.merge(right, on='key')
Out[72]:
In [73]:
# Same join inputs, but now every key value is unique on each side.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
left
Out[73]:
In [74]:
# Display the right-hand frame of the join.
right
Out[74]:
In [75]:
# Join the two frames on their shared 'key' column (inner join by default).
left.merge(right, on='key')
Out[75]:
In [81]:
# An 8 x 4 frame of standard-normal values with columns 'A'..'D'.
df = pd.DataFrame(data=np.random.randn(8, 4), columns=list('ABCD'))
df
Out[81]:
In [82]:
# Take the row at position 3 as a Series (indexed by the column names).
s = df.iloc[3]
s
Out[82]:
In [83]:
# Append the row s to the frame. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported way to add rows. The Series is turned into a
# one-row frame (to_frame().T) so its labels line up with df's columns, and
# ignore_index=True renumbers the rows 0..n.
pd.concat([df, s.to_frame().T], ignore_index=True)
Out[83]:
In [85]:
# "Group by" refers to a process involving one or more of these steps:
#   - splitting the data into groups based on some criteria
#   - applying a function to each group independently
#   - combining the results into a data structure
# Two label columns ('A', 'B') to group on and two random numeric columns.
df = pd.DataFrame({
    'A': ['foo', 'bar'] * 3 + ['foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df
Out[85]:
In [87]:
# Sum the numeric columns within each 'B' group. The columns are selected
# explicitly because, from pandas 2.0, sum() no longer silently drops the
# string column 'A' and would concatenate its values instead.
df.groupby('B')[['C', 'D']].sum()
Out[87]:
In [88]:
# Grouping by several columns gives a hierarchical (MultiIndex) row index;
# sum() is then applied within each (A, B) group.
df.groupby(['A','B']).sum()
Out[88]:
In [90]:
# Hierarchical indexing and reshaping
# Pair an outer label with an inner label to form two-level index tuples.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two'] * 4,
))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(data=np.random.randn(8, 2), index=index, columns=['A', 'B'])
# Work with the first four rows only (the 'bar' and 'baz' groups).
df2 = df.iloc[:4]
df2
Out[90]:
In [91]:
# The stack() method "compresses" a level in the DataFrame's columns: the
# column labels become the innermost level of the row index, so the result
# here is a Series.
stacked = df2.stack()
stacked
Out[91]:
In [92]:
# Inverse operation of stack(): by default the last (innermost) index level
# is moved back to the columns.
stacked.unstack()
Out[92]:
In [93]:
# Unstack index level 1 instead of the last level.
stacked.unstack(1)
Out[93]:
In [94]:
# Unstack index level 0 (the outermost level).
stacked.unstack(0)
Out[94]:
In [98]:
# Input for the pivot-table example: three label columns built from
# repeating patterns plus two random numeric columns, twelve rows in total.
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': (['foo'] * 3 + ['bar'] * 3) * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
df
Out[98]:
In [99]:
# Pivot: rows keyed by (A, B), one column per value of 'C', cells holding
# the aggregated 'E' values (mean by default).
df.pivot_table(values='E', index=['A', 'B'], columns=['C'])
Out[99]:
In [100]:
#Time Series
# Resampling: 100 one-second observations are summed into five-minute bins.
# The lowercase aliases 's' and 'min' are the canonical spellings; the
# uppercase 'S' alias is deprecated since pandas 2.2, while the lowercase
# forms are accepted by older releases as well.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5min').sum()
Out[100]:
In [101]:
# Time zone representation
# Five daily timestamps starting 2012-03-06 without any time zone attached.
rng = pd.date_range(start='3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(data=np.random.randn(len(rng)), index=rng)
ts
Out[101]:
In [102]:
# Attach the UTC time zone to the naive timestamps; the clock times are not
# converted, only labelled.
ts_utc = ts.tz_localize('UTC')
ts_utc
Out[102]:
In [103]:
# Converting between time span representations.
# Five month-end timestamps starting in January 2012.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias for month-end dates
# in favour of 'ME'; 'ME' is not understood by older releases, so the alias
# is left as is here. Confirm the target pandas version before changing it.
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
Out[103]:
In [104]:
# Convert the timestamp index to periods (no frequency is passed, so it is
# derived from the index).
ps = ts.to_period()
ps
Out[104]:
In [105]:
# Convert the periods back to timestamps; the start of each period is used
# by default.
ps.to_timestamp()
Out[105]:
In [106]:
# Convert quarterly periods (fiscal year ending in November) to 9am on the
# first day of the month that follows each quarter end:
#   quarter -> its last month ('M', 'e') -> next month (+1)
#   -> first hour of that month ('h', 's') -> 09:00 (+9)
# Lowercase 'h' is used for the hourly frequency because the uppercase 'H'
# alias is deprecated since pandas 2.2; older releases accept 'h' as well.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9
ts.head()
Out[106]:
In [107]:
# Six records with a raw letter grade each; input for the categorical demo.
df = pd.DataFrame({
    "id": list(range(1, 7)),
    "raw_grade": list("abbaae"),
})
In [108]:
# Convert the raw grades to a categorical column; the distinct values of
# raw_grade become the categories.
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
Out[108]:
In [109]:
# Rename the categories to more meaningful names (matched by position).
# Assigning to `.cat.categories` is no longer supported (removed in pandas
# 2.0), so use rename_categories(), which returns a new Series that is
# assigned back to the column.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
In [110]:
# set_categories() replaces the category list: it fixes the order and adds
# categories ("bad", "medium") that no row uses yet.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
Out[110]:
In [111]:
# Sorting a categorical column follows the category order, not the
# alphabetical order of the labels.
df.sort_values(by="grade")
Out[111]:
In [112]:
# Group sizes per category. observed=False is passed explicitly so that
# categories with no rows still appear (with size 0) regardless of the
# installed pandas version's default for `observed`.
df.groupby("grade", observed=False).size()
Out[112]:
In [116]:
# Plotting
# A random walk: the cumulative sum of 1000 standard-normal steps, indexed
# by consecutive days starting 2000-01-01.
ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
).cumsum()
ts.plot()
Out[116]:
In [117]:
# Four random walks sharing ts's date index, drawn on one set of axes.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
# DataFrame.plot() opens its own figure, so a preceding plt.figure() only
# leaves an empty extra figure behind; place the legend on the Axes that
# plot() returns instead.
df.plot().legend(loc='best')
Out[117]:
In [118]:
# Write the frame to a CSV file in the working directory; the index is
# written as the first column by default.
df.to_csv('foo.csv')
In [119]:
# Read the CSV back. No index_col is given, so the saved index comes back
# as an ordinary column.
pd.read_csv('foo.csv')
Out[119]:
In [120]:
# Write the frame to an HDF5 store under the key 'df'. The key is passed by
# keyword: positional arguments after the path are deprecated in pandas 2.x
# and become keyword-only in pandas 3.0.
df.to_hdf('foo.h5', key='df')
In [121]:
# Read the object stored under the key 'df' back from the HDF5 file.
pd.read_hdf('foo.h5', key='df')
Out[121]:
In [122]:
# Write the frame to the sheet 'Sheet1' of an Excel workbook in the working
# directory (requires an Excel writer engine such as openpyxl).
df.to_excel('foo.xlsx', sheet_name='Sheet1')
In [123]:
# Read 'Sheet1' back from the workbook: no column is used as the index and
# the literal string 'NA' is treated as a missing value.
pd.read_excel('foo.xlsx', sheet_name='Sheet1', index_col=None, na_values=['NA'])
Out[123]:
Comments
Post a Comment