-
[python]6. pandas_1인공지능/부스트캠프 Ai Tech 2022. 1. 21. 18:42728x90
[python]6. pandas_1 In [1]:from IPython.core.display import display, HTML display(HTML("<style>.container { width:90% !important; }</style>")) #창 맞추기위함
6. Pandas¶
Tabular¶
attribute, field, feature, column
In [1]:import pandas as pd import numpy as np from pandas import Series, DataFrame
In [3]:data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' #Data URL df_data = pd.read_csv(data_url, sep='\s+', header = None)
In [4]:df_data
Out[4]:0 1 2 3 4 5 6 7 8 9 10 11 12 13 0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0 1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6 2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7 3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4 4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273.0 21.0 391.99 9.67 22.4 502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273.0 21.0 396.90 9.08 20.6 503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 21.0 396.90 5.64 23.9 504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 21.0 393.45 6.48 22.0 505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 21.0 396.90 7.88 11.9 506 rows × 14 columns
In [6]:df_data.head()
Out[6]:0 1 2 3 4 5 6 7 8 9 10 11 12 13 0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0 1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6 2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7 3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4 4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2 In [7]:type(df_data.values)
Out[7]:numpy.ndarray
In [9]:df_data.columns = [ "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV", ] # Column Header 이름 지정 df_data.head()
Out[9]:CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV 0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0 1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6 2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7 3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4 4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2 Series¶
DataFrame 중 하나의 Column에 해당하는 데이터의 모음 object
In [10]:from pandas import Series, DataFrame
In [11]:list_data = [1,2,3,4,5] example_obj = Series(data = list_data) example_obj
Out[11]:0 1 1 2 2 3 3 4 4 5 dtype: int64
In [12]:list_data = [1,2,3,4,5] list_name = ["a","b","c","d","e"] example_obj = Series(data = list_data, index=list_name) example_obj
Out[12]:a 1 b 2 c 3 d 4 e 5 dtype: int64
In [13]:example_obj.values
Out[13]:array([1, 2, 3, 4, 5], dtype=int64)
In [14]:example_obj["a"]
Out[14]:1
In [16]:example_obj = example_obj.astype(float) example_obj["a"] = 3.2 example_obj
Out[16]:a 3.2 b 2.0 c 3.0 d 4.0 e 5.0 dtype: float64
In [17]:dict_data_1 = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5} indexes = ["a", "b", "c", "d", "e", "f", "g", "h"] series_obj_1 = Series(dict_data_1, index=indexes) series_obj_1 # index가 기준으로 됨
Out[17]:a 1.0 b 2.0 c 3.0 d 4.0 e 5.0 f NaN g NaN h NaN dtype: float64
dataframe¶
In [18]:raw_data = { "first_name": ["Jason", "Molly", "Tina", "Jake", "Amy"], "last_name": ["Miller", "Jacobson", "Ali", "Milner", "Cooze"], "age": [42, 52, 36, 24, 73], "city": ["San Francisco", "Baltimore", "Miami", "Douglas", "Boston"], } df = pd.DataFrame(raw_data, columns=["first_name", "last_name", "age", "city"]) df
Out[18]:first_name last_name age city 0 Jason Miller 42 San Francisco 1 Molly Jacobson 52 Baltimore 2 Tina Ali 36 Miami 3 Jake Milner 24 Douglas 4 Amy Cooze 73 Boston In [19]:df.first_name
Out[19]:0 Jason 1 Molly 2 Tina 3 Jake 4 Amy Name: first_name, dtype: object
In [20]:df["first_name"]
Out[20]:0 Jason 1 Molly 2 Tina 3 Jake 4 Amy Name: first_name, dtype: object
In [21]:type(df["first_name"])
Out[21]:pandas.core.series.Series
In [22]:df.loc[1] # loc - index label(이름) 기준으로 선택
Out[22]:first_name Molly last_name Jacobson age 52 city Baltimore Name: 1, dtype: object
In [23]:df['age'].iloc[1:] # iloc - index position
Out[23]:1 52 2 36 3 24 4 73 Name: age, dtype: int64
In [26]:s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5]) s.loc[:3]
Out[26]:49 NaN 48 NaN 47 NaN 46 NaN 45 NaN 1 NaN 2 NaN 3 NaN dtype: float64
In [27]:s.iloc[:3]
Out[27]:49 NaN 48 NaN 47 NaN dtype: float64
In [29]:df.loc[:, ['first_name','last_name']]
Out[29]:first_name last_name 0 Jason Miller 1 Molly Jacobson 2 Tina Ali 3 Jake Milner 4 Amy Cooze In [33]:df['debt'] = df.age > 40 df
Out[33]:first_name last_name age city debt 0 Jason Miller 42 San Francisco True 1 Molly Jacobson 52 Baltimore True 2 Tina Ali 36 Miami False 3 Jake Milner 24 Douglas False 4 Amy Cooze 73 Boston True In [34]:df.T
Out[34]:0 1 2 3 4 first_name Jason Molly Tina Jake Amy last_name Miller Jacobson Ali Milner Cooze age 42 52 36 24 73 city San Francisco Baltimore Miami Douglas Boston debt True True False False True In [35]:df.values
Out[35]:array([['Jason', 'Miller', 42, 'San Francisco', True], ['Molly', 'Jacobson', 52, 'Baltimore', True], ['Tina', 'Ali', 36, 'Miami', False], ['Jake', 'Milner', 24, 'Douglas', False], ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)
In [36]:del df["debt"]
In [37]:df.drop("city", axis=1)
Out[37]:first_name last_name age 0 Jason Miller 42 1 Molly Jacobson 52 2 Tina Ali 36 3 Jake Milner 24 4 Amy Cooze 73 In [38]:df
Out[38]:first_name last_name age city 0 Jason Miller 42 San Francisco 1 Molly Jacobson 52 Baltimore 2 Tina Ali 36 Miami 3 Jake Milner 24 Douglas 4 Amy Cooze 73 Boston In [39]:pop = {"Nevada": {2001: 2.4, 2002: 2.9}, "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6}} DataFrame(pop)
Out[39]:Nevada Ohio 2001 2.4 1.7 2002 2.9 3.6 2000 NaN 1.5 Selection & drop¶
In [4]:!pip install xlrd
Requirement already satisfied: xlrd in c:\users\dlsef\anaconda3\lib\site-packages (1.2.0)
In [9]:import numpy as np df = pd.read_csv("wages.csv", sep=",") df.head()
Out[9]:earn height sex race ed age 0 79571.299011 73.89 male white 16 49 1 96396.988643 66.23 female white 16 62 2 48710.666947 63.77 female white 16 33 3 80478.096153 63.22 female other 16 95 4 82089.345498 63.08 female white 17 43 In [10]:df.T
Out[10]:0 1 2 3 4 5 6 7 8 9 ... 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 earn 79571.3 96397 48710.7 80478.1 82089.3 15313.4 47104.2 50960.1 3212.65 42996.6 ... 4755.74 175901 87474 92205.6 16905.6 30173.4 24853.5 13710.7 95426 9575.46 height 73.89 66.23 63.77 63.22 63.08 64.53 61.54 73.29 72.24 72.4 ... 72.94 65.9 68.82 69.62 70.08 71.68 61.31 63.64 71.65 68.22 sex male female female female female female female male male male ... male female male male female male female female male male race white white white other white white white white hispanic white ... hispanic other white white white white white white white white ed 16 16 16 16 17 15 12 17 15 12 ... 15 18 18 18 16 12 18 12 12 12 age 49 62 33 95 43 30 53 50 25 30 ... 24 52 75 57 40 33 86 37 54 31 6 rows × 1379 columns
In [11]:df['earn']
Out[11]:0 79571.299011 1 96396.988643 2 48710.666947 3 80478.096153 4 82089.345498 ... 1374 30173.380363 1375 24853.519514 1376 13710.671312 1377 95426.014410 1378 9575.461857 Name: earn, Length: 1379, dtype: float64
In [12]:df[['earn']] # list를 만들어서 넣어주면 데이터 프레임
Out[12]:earn 0 79571.299011 1 96396.988643 2 48710.666947 3 80478.096153 4 82089.345498 ... ... 1374 30173.380363 1375 24853.519514 1376 13710.671312 1377 95426.014410 1378 9575.461857 1379 rows × 1 columns
In [14]:df[:3]
Out[14]:earn height sex race ed age 0 79571.299011 73.89 male white 16 49 1 96396.988643 66.23 female white 16 62 2 48710.666947 63.77 female white 16 33 In [15]:df['earn'][:3]
Out[15]:0 79571.299011 1 96396.988643 2 48710.666947 Name: earn, dtype: float64
In [19]:df[df['earn']>10000]
Out[19]:earn height sex race ed age 0 79571.299011 73.89 male white 16 49 1 96396.988643 66.23 female white 16 62 2 48710.666947 63.77 female white 16 33 3 80478.096153 63.22 female other 16 95 4 82089.345498 63.08 female white 17 43 ... ... ... ... ... ... ... 1373 16905.557851 70.08 female white 16 40 1374 30173.380363 71.68 male white 12 33 1375 24853.519514 61.31 female white 18 86 1376 13710.671312 63.64 female white 12 37 1377 95426.014410 71.65 male white 12 54 1048 rows × 6 columns
In [20]:df.index = df['sex']
In [22]:del df['sex'] df.head()
Out[22]:earn height race ed age sex male 79571.299011 73.89 white 16 49 female 96396.988643 66.23 white 16 62 female 48710.666947 63.77 white 16 33 female 80478.096153 63.22 other 16 95 female 82089.345498 63.08 white 17 43 In [23]:df[['earn','height']][:2]
Out[23]:earn height sex male 79571.299011 73.89 female 96396.988643 66.23 In [26]:df = df.reset_index()
In [29]:df
Out[29]:sex earn height race ed age 0 male 79571.299011 73.89 white 16 49 1 female 96396.988643 66.23 white 16 62 2 female 48710.666947 63.77 white 16 33 3 female 80478.096153 63.22 other 16 95 4 female 82089.345498 63.08 white 17 43 ... ... ... ... ... ... ... 1374 male 30173.380363 71.68 white 12 33 1375 female 24853.519514 61.31 white 18 86 1376 female 13710.671312 63.64 white 12 37 1377 male 95426.014410 71.65 white 12 54 1378 male 9575.461857 68.22 white 12 31 1379 rows × 6 columns
In [30]:df.loc[[100,500]]
Out[30]:sex earn height race ed age 100 female 48686.335231 62.9 white 12 77 500 female 10536.816469 66.3 white 13 38 In [31]:df.iloc[:10,:3]
Out[31]:sex earn height 0 male 79571.299011 73.89 1 female 96396.988643 66.23 2 female 48710.666947 63.77 3 female 80478.096153 63.22 4 female 82089.345498 63.08 5 female 15313.352901 64.53 6 female 47104.171821 61.54 7 male 50960.054282 73.29 8 male 3212.649556 72.24 9 male 42996.637884 72.40 reindex¶
In [32]:df.reset_index(drop=True) # drop=True 기존 인덱스 삭제
Out[32]:sex earn height race ed age 0 male 79571.299011 73.89 white 16 49 1 female 96396.988643 66.23 white 16 62 2 female 48710.666947 63.77 white 16 33 3 female 80478.096153 63.22 other 16 95 4 female 82089.345498 63.08 white 17 43 ... ... ... ... ... ... ... 1374 male 30173.380363 71.68 white 12 33 1375 female 24853.519514 61.31 white 18 86 1376 female 13710.671312 63.64 white 12 37 1377 male 95426.014410 71.65 white 12 54 1378 male 9575.461857 68.22 white 12 31 1379 rows × 6 columns
In [ ]:df.reset_index(inplace=True) # inplace=True 기존 값 갱신
data.drop¶
In [33]:df.drop(1) df.drop(1, inplace=True)
Out[33]:sex earn height race ed age 0 male 79571.299011 73.89 white 16 49 2 female 48710.666947 63.77 white 16 33 3 female 80478.096153 63.22 other 16 95 4 female 82089.345498 63.08 white 17 43 5 female 15313.352901 64.53 white 15 30 ... ... ... ... ... ... ... 1374 male 30173.380363 71.68 white 12 33 1375 female 24853.519514 61.31 white 18 86 1376 female 13710.671312 63.64 white 12 37 1377 male 95426.014410 71.65 white 12 54 1378 male 9575.461857 68.22 white 12 31 1378 rows × 6 columns
In [34]:df.drop("sex", axis=1)
Out[34]:earn height race ed age 0 79571.299011 73.89 white 16 49 1 96396.988643 66.23 white 16 62 2 48710.666947 63.77 white 16 33 3 80478.096153 63.22 other 16 95 4 82089.345498 63.08 white 17 43 ... ... ... ... ... ... 1374 30173.380363 71.68 white 12 33 1375 24853.519514 61.31 white 18 86 1376 13710.671312 63.64 white 12 37 1377 95426.014410 71.65 white 12 54 1378 9575.461857 68.22 white 12 31 1379 rows × 5 columns
In [35]:df.values
Out[35]:array([['male', 79571.299011024, 73.89, 'white', 16, 49], ['female', 96396.9886433106, 66.23, 'white', 16, 62], ['female', 48710.66694739099, 63.77, 'white', 16, 33], ..., ['female', 13710.6713116427, 63.64, 'white', 12, 37], ['male', 95426.0144102907, 71.65, 'white', 12, 54], ['male', 9575.46185684499, 68.22, 'white', 12, 31]], dtype=object)
In [40]:s1 = pd.Series(range(1,6), index=list("abcde")) s1
Out[40]:a 1 b 2 c 3 d 4 e 5 dtype: int64
In [41]:s2 = pd.Series(range(5,11), index=list("bcedef")) s2
Out[41]:b 5 c 6 e 7 d 8 e 9 f 10 dtype: int64
index를 기준으로 연산을 수행하여, 겹치는 index가 없으면 NaN값으로 반환된다.
In [42]:s1.add(s2)
Out[42]:a NaN b 7.0 c 9.0 d 12.0 e 12.0 e 14.0 f NaN dtype: float64
In [43]:s1 + s2
Out[43]:a NaN b 7.0 c 9.0 d 12.0 e 12.0 e 14.0 f NaN dtype: float64
In [44]:df1 = pd.DataFrame( np.arange(9).reshape(3,3), columns=list('abc')) df1
Out[44]:a b c 0 0 1 2 1 3 4 5 2 6 7 8 In [45]:df2 = pd.DataFrame( np.arange(16).reshape(4,4), columns=list('abcd')) df2
Out[45]:a b c d 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 3 12 13 14 15 In [46]:df1 + df2 # index, columns이 동시에 맞지 않아 NaN으로 처리
Out[46]:a b c d 0 0.0 2.0 4.0 NaN 1 7.0 9.0 11.0 NaN 2 14.0 16.0 18.0 NaN 3 NaN NaN NaN NaN fill_value를 사용하면 한쪽에만 값이 있는 위치의 빈 값을 fill_value로 채워서 연산하므로 NaN이 나오지 않는다.
In [47]:df1.add(df2, fill_value=0)
Out[47]:a b c d 0 0.0 2.0 4.0 3.0 1 7.0 9.0 11.0 7.0 2 14.0 16.0 18.0 11.0 3 12.0 13.0 14.0 15.0 In [48]:df = pd.DataFrame( np.arange(16).reshape(4,4), columns=list('abcd')) df
Out[48]:a b c d 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 3 12 13 14 15 In [50]:s2 = pd.Series(np.arange(10,14)) s2
Out[50]:0 10 1 11 2 12 3 13 dtype: int32
In [51]:df + s2
Out[51]:a b c d 0 1 2 3 0 NaN NaN NaN NaN NaN NaN NaN NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN 2 NaN NaN NaN NaN NaN NaN NaN NaN 3 NaN NaN NaN NaN NaN NaN NaN NaN In [52]:df.add(s2, axis=0) # axis를 기준으로 더해줘야 한다. 이때 broadcasting 발생
Out[52]:a b c d 0 10 11 12 13 1 15 16 17 18 2 20 21 22 23 3 25 26 27 28 lambda, map, apply¶
In [57]:s1 = Series(np.arange(10)) s1.head(5)
Out[57]:0 0 1 1 2 2 3 3 4 4 dtype: int32
In [58]:s1.map(lambda x: x**2).head(5)
Out[58]:0 0 1 1 2 4 3 9 4 16 dtype: int64
In [70]:def f(x): return x + 5 s1.map(f)
Out[70]:0 5 1 6 2 7 3 8 4 9 5 10 6 11 7 12 8 13 9 14 dtype: int64
In [59]:z = {1: 'A', 2: 'B', 3: 'C'} s1.map(z).head(5)
Out[59]:0 NaN 1 A 2 B 3 C 4 NaN dtype: object
In [60]:s2 = Series(np.arange(10,20)) s1.map(s2).head(5) # index를 기준으로
Out[60]:0 10 1 11 2 12 3 13 4 14 dtype: int32
In [62]:df = pd.read_csv("wages.csv")
In [64]:df['sex'].unique()
Out[64]:array(['male', 'female'], dtype=object)
In [65]:df['sex_code'] = df['sex'].map({"male": 0, "female": 1}) df.head(5)
Out[65]:earn height sex race ed age sex_code 0 79571.299011 73.89 male white 16 49 0 1 96396.988643 66.23 female white 16 62 1 2 48710.666947 63.77 female white 16 33 1 3 80478.096153 63.22 female other 16 95 1 4 82089.345498 63.08 female white 17 43 1 - replace : Map 함수의 기능중 데이터 변환 기능만 담당, 데이터 변환시 많이 사용하는 함수
In [66]:df.sex.replace({"male":0, "female":1}) # dict type df.head()
Out[66]:earn height sex race ed age sex_code 0 79571.299011 73.89 male white 16 49 0 1 96396.988643 66.23 female white 16 62 1 2 48710.666947 63.77 female white 16 33 1 3 80478.096153 63.22 female other 16 95 1 4 82089.345498 63.08 female white 17 43 1 In [67]:df.sex.replace( ['male','female'], # Target List, Conversion List [0,1], inplace=True) df.head(5)
Out[67]:earn height sex race ed age sex_code 0 79571.299011 73.89 0 white 16 49 0 1 96396.988643 66.23 1 white 16 62 1 2 48710.666947 63.77 1 white 16 33 1 3 80478.096153 63.22 1 other 16 95 1 4 82089.345498 63.08 1 white 17 43 1 In [68]:s1
Out[68]:0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int32
- apply : map과 달리, series 전체(column 단위)에 해당 함수를 적용한다.
In [71]:df_info = df[['earn','height','age']] df_info.head()
Out[71]:earn height age 0 79571.299011 73.89 49 1 96396.988643 66.23 62 2 48710.666947 63.77 33 3 80478.096153 63.22 95 4 82089.345498 63.08 43 In [72]:f = lambda x: x.max() - x.min() df_info.apply(f)
Out[72]:earn 318047.708444 height 19.870000 age 73.000000 dtype: float64
In [73]:df_info.sum()
Out[73]:earn 4.474344e+07 height 9.183125e+04 age 6.250800e+04 dtype: float64
In [74]:df_info.apply(sum)
Out[74]:earn 4.474344e+07 height 9.183125e+04 age 6.250800e+04 dtype: float64
Scalar 외에 Series 값의 반환도 가능
In [76]:def f(x): return Series([x.min(), x.max()], index=["min","max"]) df_info.apply(f)
Out[76]:earn height age min -98.580489 57.34 22 max 317949.127955 77.21 95 - applymap : series 단위가 아닌 element 단위로 함수를 적용, series 단위에 apply를 적용시킬 때와 같은 효과
In [77]:f = lambda x: -x df_info.applymap(f).head() # 모든 값에 적용됨
Out[77]:earn height age 0 -79571.299011 -73.89 -49 1 -96396.988643 -66.23 -62 2 -48710.666947 -63.77 -33 3 -80478.096153 -63.22 -95 4 -82089.345498 -63.08 -43 In [78]:df_info['earn'].apply(f).head(5)
Out[78]:0 -79571.299011 1 -96396.988643 2 -48710.666947 3 -80478.096153 4 -82089.345498 Name: earn, dtype: float64
describe¶
Numeric type 데이터의 요약정보를 보여줌
In [2]:df = pd.read_csv("wages.csv") df.head()
Out[2]:earn height sex race ed age 0 79571.299011 73.89 male white 16 49 1 96396.988643 66.23 female white 16 62 2 48710.666947 63.77 female white 16 33 3 80478.096153 63.22 female other 16 95 4 82089.345498 63.08 female white 17 43 In [3]:df.describe()
Out[3]:earn height ed age count 1379.000000 1379.000000 1379.000000 1379.000000 mean 32446.292622 66.592640 13.354605 45.328499 std 31257.070006 3.818108 2.438741 15.789715 min -98.580489 57.340000 3.000000 22.000000 25% 10538.790721 63.720000 12.000000 33.000000 50% 26877.870178 66.050000 13.000000 42.000000 75% 44506.215336 69.315000 15.000000 55.000000 max 317949.127955 77.210000 18.000000 95.000000 - unique : series data의 유일한 값 list를 반환함
In [4]:df.race.unique() # 유일한 인종의 값 list
Out[4]:array(['white', 'other', 'hispanic', 'black'], dtype=object)
In [5]:np.array(dict(enumerate(df['race'].unique()))) # dict type으로 index
Out[5]:array({0: 'white', 1: 'other', 2: 'hispanic', 3: 'black'}, dtype=object)
In [6]:value = list(map(int, np.array(list(enumerate(df['race'].unique())))[:,0].tolist())) key = np.array(list(enumerate(df['race'].unique())), dtype=str)[:, 1].tolist() value, key # label index 값과 label 값 각각 추출
Out[6]:([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])
In [9]:df['race'].replace(to_replace=key, value=value, inplace=True) # race의 단어를 숫자로 변환, label-encoding
In [11]:df['race']
Out[11]:0 0 1 0 2 0 3 1 4 0 .. 1374 0 1375 0 1376 0 1377 0 1378 0 Name: race, Length: 1379, dtype: int64
- sum : column, row 연산 지원
In [12]:df.sum(axis=0)
Out[12]:earn 4.47434e+07 height 91831.3 sex malefemalefemalefemalefemalefemalefemalemalema... race 561 ed 18416 age 62508 dtype: object
In [13]:df.sum(axis=1)
Out[13]:0 79710.189011 1 96541.218643 2 48823.436947 3 80653.316153 4 82212.425498 ... 1374 30290.060363 1375 25018.829514 1376 13823.311312 1377 95563.664410 1378 9686.681857 Length: 1379, dtype: float64
In [25]:numeric_col = ['earn', 'height','ed','age'] # 체크! df[numeric_col]
Out[25]:earn height ed age 0 79571.299011 73.89 16 49 1 96396.988643 66.23 16 62 2 48710.666947 63.77 16 33 3 80478.096153 63.22 16 95 4 82089.345498 63.08 17 43 ... ... ... ... ... 1374 30173.380363 71.68 12 33 1375 24853.519514 61.31 18 86 1376 13710.671312 63.64 12 37 1377 95426.014410 71.65 12 54 1378 9575.461857 68.22 12 31 1379 rows × 4 columns
In [26]:df[numeric_col].sum(axis=1)
Out[26]:0 79710.189011 1 96541.218643 2 48823.436947 3 80652.316153 4 82212.425498 ... 1374 30290.060363 1375 25018.829514 1376 13823.311312 1377 95563.664410 1378 9686.681857 Length: 1379, dtype: float64
In [27]:df[numeric_col].sum(axis=0)
Out[27]:earn 4.474344e+07 height 9.183125e+04 ed 1.841600e+04 age 6.250800e+04 dtype: float64
- isnull : 각 값이 null(NaN)인지 여부를 True/False로 반환
In [14]:df.isnull()
Out[14]:earn height sex race ed age 0 False False False False False False 1 False False False False False False 2 False False False False False False 3 False False False False False False 4 False False False False False False ... ... ... ... ... ... ... 1374 False False False False False False 1375 False False False False False False 1376 False False False False False False 1377 False False False False False False 1378 False False False False False False 1379 rows × 6 columns
In [15]:df.isnull().sum()
Out[15]:earn 0 height 0 sex 0 race 0 ed 0 age 0 dtype: int64
- sort_values : columns 값을 기준으로 데이터를 sorting
In [17]:df.sort_values(['age','earn'], ascending=True).head(10)
Out[17]:earn height sex race ed age 1038 -56.321979 67.81 male 2 10 22 800 -27.876819 72.29 male 0 12 22 963 -25.655260 68.90 male 0 12 22 1105 988.565070 64.71 female 0 12 22 801 1000.221504 64.09 female 0 12 22 862 1002.023843 66.59 female 0 12 22 933 1007.994941 68.26 female 0 12 22 988 1578.542814 64.53 male 0 12 22 522 1955.168187 69.87 female 3 12 22 765 2581.870402 64.79 female 0 12 22 - Correlation & Covariance : 상관계수와 공분산을 구하는 함수
In [19]:df.age.corr(df.earn)
Out[19]:0.07400349177836058
In [20]:df.age.cov(df.earn)
Out[20]:36523.6992104089
In [22]:df.corrwith(df.earn) # df 와 df.earn의 corr을 볼수 있다.
Out[22]:earn 1.000000 height 0.291600 race -0.063977 ed 0.350374 age 0.074003 dtype: float64
In [23]:df.corr()
Out[23]:earn height race ed age earn 1.000000 0.291600 -0.063977 0.350374 0.074003 height 0.291600 1.000000 -0.045974 0.114047 -0.133727 race -0.063977 -0.045974 1.000000 -0.049487 -0.056879 ed 0.350374 0.114047 -0.049487 1.000000 -0.129802 age 0.074003 -0.133727 -0.056879 -0.129802 1.000000 In [28]:df.age > 15
Out[28]:0 True 1 True 2 True 3 True 4 True ... 1374 True 1375 True 1376 True 1377 True 1378 True Name: age, Length: 1379, dtype: bool
In [29]:(df.age < 45) & (df.age > 15)
Out[29]:0 False 1 False 2 True 3 False 4 True ... 1374 True 1375 False 1376 True 1377 False 1378 True Name: age, Length: 1379, dtype: bool
In [33]:df['age'][(df.age < 45) & (df.age > 15)]
Out[33]:2 33 4 43 5 30 8 25 9 30 .. 1369 24 1373 40 1374 33 1376 37 1378 31 Name: age, Length: 772, dtype: int64
In [31]:df['age'][(df.age < 45) & (df.age > 15)].corr(df.earn)
Out[31]:0.3141178872518904
In [ ]:In [ ]:'인공지능 > 부스트캠프 Ai Tech' 카테고리의 다른 글
[Pytorch]1. PyTorch Basics (0) 2022.01.28 [python]6. pandas_2 (0) 2022.01.21 [python]5. Numpy (0) 2022.01.21 [python]4. Python Data Handling (0) 2022.01.21 [python]3. Exception_File_LogHandling (0) 2022.01.21