import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# One-dimensional ndarray with axis labels (including time series).

s = pd.Series([1,3,5,np.nan,6,8])

s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).
#Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. 
#The primary pandas data structure


dates = pd.date_range('20130101', periods =6)

dates

df = pd.DataFrame(np.random.randn(6,4)*10, index=dates, columns = list ('ABCD'))

df

#Creating a DataFrame by passing a dict of objects that can be converted to series-like.

df2 = pd.DataFrame ({'A': 1.,
                     'B': pd.Timestamp('20130102'),
                     'C': pd.Series(1,index=list(range(4)),dtype = 'float32'),
                     'D': np.array ([3] *4, dtype = 'int32'),
                     'E': pd.Categorical(["test", "train", "test", "train"]),
                     'F': 'foo'})

df2

df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#Subset of the attributes

df2.<TAB>

  File "<ipython-input-6-f3c61b116133>", line 3
    df2.<TAB>
        ^
SyntaxError: invalid syntax

df.head()

df.tail(3)

df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

df.values

array([[  5.7012378 ,   6.85505883,   7.28001198,  -7.7491236 ],
       [  7.70938659,  15.03031926, -12.74426791,   5.10285248],
       [ -1.80135618,   3.93479393,  -5.57257698,  -3.97035788],
       [  3.50209395,  -2.08442948, -16.14170628,  -5.25054536],
       [  9.14936458,   1.13866273,  14.29227682,   7.21553986],
       [  1.03722287, -11.66508526,   0.82615942,  11.33784591]])

# Describe shows a quick statistic summary of your data

df.describe()

# Transposing your data

df.T

# Sorting by an axis
df.sort_index(axis=1, ascending=False)

 df.sort_values(by='B')

df['A']

2013-01-01    5.701238
2013-01-02    7.709387
2013-01-03   -1.801356
2013-01-04    3.502094
2013-01-05    9.149365
2013-01-06    1.037223
Freq: D, Name: A, dtype: float64

df[0:3]

df['20130102':'20130104']

#For getting a cross section using a label

df.loc[dates[0]]

A    5.701238
B    6.855059
C    7.280012
D   -7.749124
Name: 2013-01-01 00:00:00, dtype: float64

df.loc[:,['A', 'B']]

df.loc ['20130102':'20130104', ['A', 'B']]

df.loc['20130102',['A','B']]

A     7.709387
B    15.030319
Name: 2013-01-02 00:00:00, dtype: float64

df.loc[dates[0], 'A']

5.7012378049110328

df.at[dates[0],'A']

5.7012378049110328

#Select via the position of the passed integers

df.iloc[3]

A     3.502094
B    -2.084429
C   -16.141706
D    -5.250545
Name: 2013-01-04 00:00:00, dtype: float64

df.iloc[3:5,0:2]

df.iloc[[1,2,4],[0,2]]

df.iloc[1:3,:]

df.iloc[1,1]

15.030319261378509

df.iat[1,1]

15.030319261378509

#Boolean Indexing

#Using a single column’s values to select data.

df[df.A > 5]

# Selecting values from a DataFrame where a boolean condition is met.

df[df > 5]

# Using the isin() method for filtering:

df2 = df.copy()

df2['E'] = ['one', 'two', 'two', 'three', 'four', 'three']

df2

df2[df2['E'].isin(['two','four'])]

#Setting a new column automatically aligns the data by the indexes

s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range('20130102', periods= 6))

s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

df['F'] = s1

# Setting values by label

df.at[dates[0],'A'] = 0

# Setting values by position

df.iat[0,1] =0

# Setting by assigning with a numpy array
df.loc[:,'D'] = np.array ([5] * len(df))

df

# A where operation with setting

df2 = df.copy()

df2[df2 > 0]= -df2

df2

# Reindexing allows you to change/add/delete the index on a specified axis.
# This returns a copy of the data.

df1 = df.reindex(index=dates[0:4], columns = list(df.columns) + ['E'])

df1.loc[dates[0]:dates[2], 'E'] = 1

df1

# To drop any rows that have missing data.

df1.dropna(how='any')

# Filling missing data

df1.fillna(value=6)

# To get the boolean mask where values are nan

pd.isnull(df1)

df.mean ()

A    3.266119
B    1.059044
C   -2.010017
D    5.000000
F    3.000000
dtype: float64

df.mean(1)

2013-01-01    3.070003
2013-01-02    3.199088
2013-01-03    0.712172
2013-01-04   -1.344808
2013-01-05    6.716061
2013-01-06    0.039659
Freq: D, dtype: float64

# Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.

s = pd.Series([1,3,5,np.nan,6,8], index= dates).shift(1)

s

2013-01-01    NaN
2013-01-02    1.0
2013-01-03    3.0
2013-01-04    5.0
2013-01-05    NaN
2013-01-06    6.0
Freq: D, dtype: float64

df.sub(s,axis='index')

#Applying functions to the data

df.apply(np.cumsum)

df.apply(lambda x: x.max()- x.min())

A    10.950721
B    26.695405
C    30.433983
D     0.000000
F     4.000000
dtype: float64

#Histogramming

s = pd.Series(np.random.randint(0, 7, size=10))
s

0    0
1    2
2    6
3    2
4    6
5    4
6    0
7    4
8    5
9    1
dtype: int32

s.value_counts()

6    2
4    2
2    2
0    2
5    1
1    1
dtype: int64

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,'CABA','dog','cat'])

s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

df = pd.DataFrame(np.random.randn(10,4))

df

df[:3]

df[3:7]

df[7:]

# break it into pieces

pieces = [df[:3], df[3:7], df[7:]]

pd.concat(pieces)

 #Database style joining
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

left

right

pd.merge(left, right, on='key')

left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

left

right

pd.merge(left, right, on='key')

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])

df

s = df.iloc[3]

s

A   -0.736062
B    0.597500
C    0.333268
D   -0.172509
Name: 3, dtype: float64

df.append(s, ignore_index=True)

# By “group by” we are referring to a process involving one or more of the following steps
#Splitting the data into groups based on some criteria
#Applying a function to each group independently
#Combining the results into a data structure


df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

df

df.groupby('B').sum()

df.groupby(['A','B']).sum()

# hierrarchical Indexing and Reshaping

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                   'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                    'one', 'two', 'one', 'two']]))

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

df2 = df[:4]

df2

# The stack() method “compresses” a level in the DataFrame’s columns.
stacked = df2.stack()

stacked

first  second   
bar    one     A    1.646276
               B   -0.634797
       two     A    0.589237
               B   -0.464283
baz    one     A    0.118206
               B    1.219647
       two     A    0.346672
               B   -0.430455
dtype: float64

# Inverse operation of stack()

stacked.unstack()

stacked.unstack(1)

stacked.unstack(0)

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})

df

pd.pivot_table(df, values='E', index=['A', 'B'], columns=['C'])

#Time Series

rng = pd.date_range('1/1/2012', periods=100, freq='S')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

ts.resample('5Min').sum()

2012-01-01    24717
Freq: 5T, dtype: int32

#Timezone Representation

rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')

ts = pd.Series(np.random.randn(len(rng)), rng)

ts

2012-03-06   -0.328047
2012-03-07   -2.065020
2012-03-08   -0.602003
2012-03-09    2.039196
2012-03-10   -0.498256
Freq: D, dtype: float64

ts_utc = ts.tz_localize('UTC')

ts_utc

2012-03-06 00:00:00+00:00   -0.328047
2012-03-07 00:00:00+00:00   -2.065020
2012-03-08 00:00:00+00:00   -0.602003
2012-03-09 00:00:00+00:00    2.039196
2012-03-10 00:00:00+00:00   -0.498256
Freq: D, dtype: float64

# Converting between time span representations

rng = pd.date_range('1/1/2012', periods=5, freq='M')

ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts

2012-01-31   -0.833088
2012-02-29    0.198797
2012-03-31    1.426191
2012-04-30    1.431073
2012-05-31    1.357619
Freq: M, dtype: float64

ps = ts.to_period()


ps

2012-01   -0.833088
2012-02    0.198797
2012-03    1.426191
2012-04    1.431073
2012-05    1.357619
Freq: M, dtype: float64

ps.to_timestamp()

2012-01-01   -0.833088
2012-02-01    0.198797
2012-03-01    1.426191
2012-04-01    1.431073
2012-05-01    1.357619
Freq: MS, dtype: float64

prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')

ts = pd.Series(np.random.randn(len(prng)), prng)

ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9

ts.head()

1990-03-01 09:00   -0.354049
1990-06-01 09:00    0.230064
1990-09-01 09:00   -0.561966
1990-12-01 09:00    0.132596
1991-03-01 09:00    0.572927
Freq: H, dtype: float64

df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

df["grade"] = df["raw_grade"].astype("category")

df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

df["grade"].cat.categories = ["very good", "good", "very bad"]

df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

df.sort_values(by="grade")

df.groupby("grade").size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

# Plotting

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

ts = ts.cumsum()

ts.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x21a92ced128>

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

plt.figure(); df.plot(); plt.legend(loc='best')

<matplotlib.legend.Legend at 0x21a92e9b550>

df.to_csv('foo.csv')

 pd.read_csv('foo.csv')

df.to_hdf('foo.h5','df')

pd.read_hdf('foo.h5','df')

df.to_excel('foo.xlsx', sheet_name='Sheet1')

pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	4.216325	2.201553	-2.010017	1.114369
std	4.136922	8.955663	11.728034	7.779201
min	-1.801356	-11.665085	-16.141706	-7.749124
25%	1.653441	-1.278656	-10.951345	-4.930498
50%	4.601666	2.536728	-2.373209	0.566247
75%	7.207349	6.124993	5.666549	6.687368
max	9.149365	15.030319	14.292277	11.337846

	0	1	2	3
0	0.408694	0.075593	1.092644	0.067992
1	1.242592	-0.613958	1.302029	0.398823
2	0.571330	-1.211505	-1.642547	-0.963620
3	0.046183	-1.009694	0.987836	-1.104112
4	1.852863	-0.881583	-0.059117	1.300894
5	1.335388	-1.059745	-0.370339	1.793200
6	-0.470991	0.022208	-1.045180	-0.001308
7	0.469683	0.462645	0.291096	-1.369258
8	0.163337	2.233351	1.482339	0.027145
9	-0.114106	0.900924	-0.278806	-1.035798

	0	1	2	3
0	0.408694	0.075593	1.092644	0.067992
1	1.242592	-0.613958	1.302029	0.398823
2	0.571330	-1.211505	-1.642547	-0.963620

	0	1	2	3
3	0.046183	-1.009694	0.987836	-1.104112
4	1.852863	-0.881583	-0.059117	1.300894
5	1.335388	-1.059745	-0.370339	1.793200
6	-0.470991	0.022208	-1.045180	-0.001308

	0	1	2	3
7	0.469683	0.462645	0.291096	-1.369258
8	0.163337	2.233351	1.482339	0.027145
9	-0.114106	0.900924	-0.278806	-1.035798

Search This Blog

Datascience Ka Naya Khiladi

Pandas Library

Comments

Post a Comment

Popular posts from this blog

Exercise 2 - 3

Matplot LIbrary

	A	B	C	D
2013-01-01	5.701238	6.855059	7.280012	-7.749124
2013-01-02	7.709387	15.030319	-12.744268	5.102852
2013-01-03	-1.801356	3.934794	-5.572577	-3.970358
2013-01-04	3.502094	-2.084429	-16.141706	-5.250545
2013-01-05	9.149365	1.138663	14.292277	7.215540
2013-01-06	1.037223	-11.665085	0.826159	11.337846

	A	B	C	D	E	F
0	1.0	2013-01-02	1.0	3	test	foo
1	1.0	2013-01-02	1.0	3	train	foo
2	1.0	2013-01-02	1.0	3	test	foo
3	1.0	2013-01-02	1.0	3	train	foo

	A	B	C	D	F
2013-01-01	0.000000	0.000000	7.280012	5	NaN
2013-01-02	7.709387	15.030319	-12.744268	5	1.0
2013-01-03	-1.801356	3.934794	-5.572577	5	2.0
2013-01-04	3.502094	-2.084429	-16.141706	5	3.0
2013-01-05	9.149365	1.138663	14.292277	5	4.0
2013-01-06	1.037223	-11.665085	0.826159	5	5.0

	A	B	C	D	F	E
2013-01-01	False	False	False	False	True	False
2013-01-02	False	False	False	False	False	False
2013-01-03	False	False	False	False	False	False
2013-01-04	False	False	False	False	False	True

	A	B	C	D	F
2013-01-01	NaN	NaN	NaN	NaN	NaN
2013-01-02	6.709387	14.030319	-13.744268	4.0	0.0
2013-01-03	-4.801356	0.934794	-8.572577	2.0	-1.0
2013-01-04	-1.497906	-7.084429	-21.141706	0.0	-2.0
2013-01-05	NaN	NaN	NaN	NaN	NaN
2013-01-06	-4.962777	-17.665085	-5.173841	-1.0	-1.0

	A	B	C	D
0	0.064288	1.073855	1.796381	-0.224552
1	0.352561	-0.506424	-0.718441	-0.257718
2	-1.137121	1.874188	-0.635592	0.281018
3	-0.736062	0.597500	0.333268	-0.172509
4	-1.309330	1.177244	0.967628	-1.198871
5	-0.376439	-0.805552	1.237947	0.720198
6	1.073608	0.473752	-0.398873	-0.825282
7	1.299102	0.704625	0.284264	-2.502382

	A	B	C	D
0	foo	one	0.265532	1.332233
1	bar	one	0.479317	0.608824
2	foo	two	-1.106388	-0.257345
3	bar	three	-1.022975	-0.526723
4	foo	two	0.300154	1.705182
5	bar	two	-1.526280	-0.828928
6	foo	one	0.236994	-1.676835
7	foo	three	-0.054521	-1.251884

	C	D
B
one	0.981842	0.264222
three	-1.077496	-1.778607
two	-2.332514	0.618910

		A	B
first	second
bar	one	1.646276	-0.634797
bar	two	0.589237	-0.464283
baz	one	0.118206	1.219647
baz	two	0.346672	-0.430455

	A	B	C	D	E
0	one	A	foo	-1.982610	0.448172
1	one	B	foo	0.062427	-0.744829
2	two	C	foo	-0.745716	-0.313134
3	three	A	bar	-0.835508	-1.152225
4	one	B	bar	-0.859922	0.549435
5	one	C	bar	0.830919	-1.050315
6	two	A	foo	0.267296	-0.252132
7	three	B	foo	0.449456	1.477294
8	one	C	foo	0.123385	0.860408
9	one	A	bar	0.889326	0.223538
10	two	B	bar	0.486930	0.674512
11	three	C	bar	-0.471057	-1.595881

	A	B	C	D	F
2013-01-01	0.000000	0.000000	7.280012	5	NaN
2013-01-02	7.709387	15.030319	-5.464256	10	1.0
2013-01-03	5.908030	18.965113	-11.036833	15	3.0
2013-01-04	9.410124	16.880684	-27.178539	20	6.0
2013-01-05	18.559489	18.019346	-12.886262	25	10.0
2013-01-06	19.596712	6.354261	-12.060103	30	15.0

	key	lval
0	foo	1
1	foo	2

	key	rval
0	foo	4
1	foo	5

	key	lval
0	foo	1
1	bar	2

	id	raw_grade	grade
5	6	e	very bad
1	2	b	good
2	3	b	good
0	1	a	very good
3	4	a	very good
4	5	a	very good

	Unnamed: 0	A	B	C	D
0	2000-01-01	-1.960527	-1.285968	-0.770272	1.732647
1	2000-01-02	-1.869375	-1.599188	-0.335176	1.961362
2	2000-01-03	-2.208295	-3.530446	0.266622	1.353577
3	2000-01-04	-1.727556	-3.565926	1.374389	1.917739
4	2000-01-05	-0.374437	-3.784898	-0.094169	2.786670
5	2000-01-06	-2.165799	-2.597103	-1.204030	2.745869
6	2000-01-07	-2.075078	-1.667786	-1.015625	3.632785
7	2000-01-08	-3.332855	-1.343136	-0.714277	5.110379
8	2000-01-09	-4.770663	-2.718791	2.343364	4.759776
9	2000-01-10	-5.007911	-3.119180	4.162103	7.178859
10	2000-01-11	-6.099561	-2.037347	4.652177	6.461335
11	2000-01-12	-4.538086	-1.593626	4.678336	5.706831
12	2000-01-13	-4.767281	-3.213769	5.378705	4.593024
13	2000-01-14	-6.067503	-2.167420	6.422238	4.398896
14	2000-01-15	-4.447423	-0.804950	8.166062	4.874810
15	2000-01-16	-4.034009	-0.723190	8.249881	4.416808
16	2000-01-17	-2.237815	-2.231110	6.861279	6.396465
17	2000-01-18	-2.750334	-2.457918	7.349380	6.311472
18	2000-01-19	-3.047330	-1.558872	7.433276	7.746978
19	2000-01-20	-1.490948	-1.525291	7.869527	8.170367
20	2000-01-21	-1.734994	-2.125683	7.048601	9.737116
21	2000-01-22	-1.710590	-3.667013	7.470567	9.537319
22	2000-01-23	-1.563084	-3.666055	7.867744	9.662418
23	2000-01-24	-2.484694	-3.232842	8.269260	9.339057
24	2000-01-25	-3.578921	-3.049872	7.872652	8.390124
25	2000-01-26	-4.244715	-2.754831	7.898890	8.161921
26	2000-01-27	-5.494957	-1.986982	7.331185	6.513367
27	2000-01-28	-5.528515	-2.813747	7.121798	7.129052
28	2000-01-29	-5.073192	-1.825633	6.207346	8.442592
29	2000-01-30	-7.780544	-1.076250	6.377382	7.883617
...	...	...	...	...	...
970	2002-08-28	-44.017723	-4.404109	16.899174	-3.480199
971	2002-08-29	-43.114688	-4.383443	17.528599	-1.689035
972	2002-08-30	-41.922337	-4.542466	16.670459	-2.249377
973	2002-08-31	-42.385755	-4.517940	18.834638	-3.075029
974	2002-09-01	-43.861665	-3.233773	19.172976	-3.295808
975	2002-09-02	-42.855793	-3.126281	17.267810	-1.674596
976	2002-09-03	-42.012861	-3.019981	17.629445	-3.778466
977	2002-09-04	-41.173032	-4.188818	17.295629	-4.378031
978	2002-09-05	-40.541550	-4.241131	17.236057	-3.671891
979	2002-09-06	-41.425338	-4.205773	16.088178	-4.459475
980	2002-09-07	-41.521988	-6.307009	15.527701	-5.666109
981	2002-09-08	-41.677102	-5.661782	14.303965	-4.337424
982	2002-09-09	-40.586610	-5.694311	15.961002	-3.889111
983	2002-09-10	-39.825491	-6.402531	15.006266	-2.606617
984	2002-09-11	-39.913708	-5.881342	14.111499	-3.717421
985	2002-09-12	-39.139080	-7.662673	13.700208	-4.313833
986	2002-09-13	-38.767603	-8.796602	13.802048	-4.249081
987	2002-09-14	-38.149622	-7.482174	15.204716	-4.286389
988	2002-09-15	-37.132710	-7.029820	16.056653	-3.609412
989	2002-09-16	-35.220422	-6.245927	17.128572	-2.957648
990	2002-09-17	-36.093959	-6.413961	18.150520	-2.637952
991	2002-09-18	-36.402336	-5.096145	18.317224	-1.837115
992	2002-09-19	-33.749346	-5.860567	18.575125	-0.991736
993	2002-09-20	-32.394332	-5.664858	20.529412	-3.751172
994	2002-09-21	-34.574008	-4.575883	20.004798	-4.152625
995	2002-09-22	-36.980502	-5.331763	20.361332	-3.797302
996	2002-09-23	-37.399011	-5.330728	21.173518	-3.465505
997	2002-09-24	-37.767723	-3.607712	21.651700	-3.331427
998	2002-09-25	-36.151342	-2.315392	22.765483	-2.369131
999	2002-09-26	-38.292146	-1.700859	22.142733	-3.808816