6_판다스2

2022-06-02 12 분 소요

import warnings
warnings.filterwarnings('ignore')
from IPython.display import Image
import numpy as np
import pandas as pd

#판다스의 데이터프레임과 시리즈
#데이터프레임과 시리즈는 딕셔너리와 비슷하지만 데이터를 다루는데 더 특화가 되어있다.

#파이썬의 리스트와 딕셔너리는 많은 양의 데이터를 저장하거나 조작할 수 있는 함수가
#많지 않다.
#반면에 판다스의 데이터프레임과 시리즈는 많은 양의 데이터를 저장할수 있을 뿐만
#아니라 스프레드시트 프로그램(엑셀 등)을 사용하는 것 처럼 행과 열 단위로 데이터를
#조작할 수 있는 다양한 속성과 함수를 제공한다.

# Creating a Series
# Without an explicit index the values are labelled 0, 1, ...
name_values = ['홍길동', '이몽룡']
s = pd.Series(name_values)
print(type(s), s, sep='\n')

# Index labels do not have to be numbers. To label the values with strings,
# pass the labels as a list through the `index` parameter of pd.Series().
s = pd.Series(name_values, index=['도적넘', '공무원'])
print(s)

<class 'pandas.core.series.Series'>
0    홍길동
1    이몽룡
dtype: object
도적넘    홍길동
공무원    이몽룡
dtype: object

데이터프레임 만들기

# A DataFrame can be built by passing a dictionary to pd.DataFrame().
# Each key becomes a column name and the list stored under that key
# becomes the data of that column: 'key': [value, value, ...].
people = {
    'name': ['홍길동', '임꺽정'],
    'job': ['도둑넘', '도적넘'],
    'born': ['1920-04-20', '1875-06-10'],
    'died': ['1957-08-20', '1930-12-10'],
    'age': [37, 60],
}
df = pd.DataFrame(people)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>

	name	job	born	died	age
0	홍길동	도둑넘	1920-04-20	1957-08-20	37
1	임꺽정	도적넘	1875-06-10	1930-12-10	60

# pd.DataFrame() takes the values through `data`, the row labels through
# `index`, and (optionally) the column names through `columns`.
df = pd.DataFrame(
    data=dict(
        job=['도둑넘', '도적넘'],
        born=['1920-04-20', '1875-06-10'],
        died=['1957-08-20', '1930-12-10'],
        age=[37, 60],
    ),
    index=['홍길동', '임꺽정'],
    # `columns` can be used to limit which columns are shown,
    # e.g. columns=['job', 'age'].
)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>

	job	born	died	age
홍길동	도둑넘	1920-04-20	1957-08-20	37
임꺽정	도적넘	1875-06-10	1930-12-10	60

시리즈 다루기

# A single row of a DataFrame is returned as a Series.
# `loc` selects the row by its index label ...
row = df.loc['임꺽정']
print(type(row), row, sep='\n')

# ... and `iloc` selects it by its integer position.
row = df.iloc[0]
print(type(row), row, sep='\n')

<class 'pandas.core.series.Series'>
job            도적넘
born    1875-06-10
died    1930-12-10
age             60
Name: 임꺽정, dtype: object
<class 'pandas.core.series.Series'>
job            도둑넘
born    1920-04-20
died    1957-08-20
age             37
Name: 홍길동, dtype: object

# Extract a column as a Series.
# The leading ':' selects every row; the second item picks the column,
# by name with `loc` and by integer position with `iloc`.
col = df.loc[:, 'job']
print(type(col), col, sep='\n')

col = df.iloc[:, 3]
print(type(col), col, sep='\n')

<class 'pandas.core.series.Series'>
홍길동    도둑넘
임꺽정    도적넘
Name: job, dtype: object
<class 'pandas.core.series.Series'>
홍길동    37
임꺽정    60
Name: age, dtype: int64

# Extract a column as a Series without loc/iloc.
# Bracket indexing, get() and attribute access all return the same column;
# attribute access is for fetching a single column.
for col in (df['job'], df.get('job'), df.job):
    print(type(col))
    print(col)

<class 'pandas.core.series.Series'>
홍길동    도둑넘
임꺽정    도적넘
Name: job, dtype: object
<class 'pandas.core.series.Series'>
홍길동    도둑넘
임꺽정    도적넘
Name: job, dtype: object
<class 'pandas.core.series.Series'>
홍길동    도둑넘
임꺽정    도적넘
Name: job, dtype: object

index, values 속성과 keys() 함수

# The `index` attribute returns the labels of a Series.
for series in (row, col):
    print(type(series.index))
    print(series.index)

<class 'pandas.core.indexes.base.Index'>
Index(['job', 'born', 'died', 'age'], dtype='object')
<class 'pandas.core.indexes.base.Index'>
Index(['홍길동', '임꺽정'], dtype='object')

# The `values` attribute returns the data of a Series.
row_data = row.values
print(type(row_data), row_data, sep='\n')

col_data = col.values
print(type(col_data), col_data, sep='\n')

<class 'numpy.ndarray'>
['도둑넘' '1920-04-20' '1957-08-20' 37]
<class 'numpy.ndarray'>
[37 60]

# keys() is a method that returns the same labels as the `index` attribute
# (labels only identify the entries; they do not have to be numbers).
# It must be called with parentheses: printing `row.keys` without them shows
# the bound-method object (`<class 'method'>`, `<bound method Series.keys ...>`)
# instead of the labels.
print(type(row.keys()))
print(row.keys())

print(type(col.keys()))
print(col.keys())

<class 'method'>
<bound method Series.keys of job            도둑넘
born    1920-04-20
died    1957-08-20
age             37
Name: 홍길동, dtype: object>
<class 'method'>
<bound method Series.keys of 홍길동    37
임꺽정    60
Name: age, dtype: int64>

# Both `index` and `values` can be read at a given position with [].
print(row.index[0], col.index[0], row.values[0], col.values[0], sep='\n')

job
홍길동
도둑넘
37

시리즈 기초 통계함수

# Load the scientists dataset used by the statistics examples.
csv_path = "./data/scientists.csv"
scientists = pd.read_csv(csv_path)
print(type(scientists))
scientists

<class 'pandas.core.frame.DataFrame'>

	Name	Born	Died	Age	Occupation
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist
5	John Snow	1813-03-15	1858-06-16	45	Physician
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

# Take the Age column as a Series and summarise it.
ages = scientists['Age']
print(type(ages), ages, sep='\n')

# Basic statistics methods of a Series; std() is the standard deviation.
for stat in ('sum', 'mean', 'max', 'min', 'count', 'std'):
    print(f'{stat}() = {getattr(ages, stat)()}')

<class 'pandas.core.series.Series'>
0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
sum() = 473
mean() = 59.125
max() = 90
min() = 37
count() = 8
std() = 18.325918413937288

# Broadcasting
# Applying one operation to every element of a Series or DataFrame at once,
# as in `ages > ages.mean()`, is called broadcasting.
# A value that holds several elements is called a vector, and a value that
# only expresses a single magnitude is called a scalar.

# Select the ages that are above the mean age.
mean_age = ages.mean()
print(ages > mean_age)
bool_value = ages > mean_age
print(bool_value)
print(ages[bool_value])

# Indexing also accepts a hand-written list of booleans, one per element.
manual_mask = [False, False, True, True, False, False, False, True]
print(ages[manual_mask])

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool
0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool
1    61
2    90
3    66
7    77
Name: Age, dtype: int64
2    90
3    66
7    77
Name: Age, dtype: int64

print(ages[ages > ages.mean()])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

def _print_row(values):
    """Print every integer in a 4-wide field on one line, then a newline."""
    for value in values:
        print(f'{value:4d}', end=' ')
    print()


# The original ages.
_print_row(ages)

# An operation between two vectors of the same length produces a vector
# of that same length.
_print_row(ages + ages)
_print_row(ages * ages)

# An operation between a vector and a scalar applies the scalar to every
# element of the vector.
_print_row(ages + 100)
_print_row(ages * 3)

  37   61   90   66   56   45   41   77 
  74  122  180  132  112   90   82  154 
1369 3721 8100 4356 3136 2025 1681 5929 
 137  161  190  166  156  145  141  177 
 111  183  270  198  168  135  123  231 

short = pd.Series([1, 100])
print(short)

# When two vectors of different lengths are combined, only the entries
# whose index labels exist in both are computed. Here only the labels of
# pd.Series([1, 100]) have a partner; the rest cannot be computed and
# become missing values (NaN).
print(ages + short)

0      1
1    100
dtype: int64
0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

print(ages)

# sort_index() orders a Series by its index labels, not by its data.
# Sorting is ascending by default; pass ascending=False to sort in
# descending order.
print(ages.sort_index(ascending=False))

# sort_values() orders a Series by its data, not by its index labels.
for ascending in (True, False):
    print(ages.sort_values(ascending=ascending))

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64
0    37
6    41
5    45
4    56
1    61
3    66
7    77
2    90
Name: Age, dtype: int64
2    90
7    77
3    66
1    61
4    56
5    45
6    41
0    37
Name: Age, dtype: int64

print(ages)
rev_ages = ages.sort_index(ascending=False)
print(rev_ages)

# Vector-with-vector operations pair up entries that share an index label,
# so the order in which either Series is sorted does not matter.
aligned_sum = ages + rev_ages
for total in aligned_sum:
    print(f'{total:4d}', end=' ')
print()

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64
  74  122  180  132  112   90   82  154 

데이터프레임도 시리즈와 마찬가지로 브로드캐스팅과 인덱싱을 할 수 있다.

scientists

	Name	Born	Died	Age	Occupation
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist
5	John Snow	1813-03-15	1858-06-16	45	Physician
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

# DataFrame counterpart of print(ages[ages > ages.mean()]):
# keep only the rows whose Age is above the mean Age.
scientists[scientists['Age'] > scientists['Age'].mean()] 

	Name	Born	Died	Age	Occupation
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

scientists[scientists['Age'] > scientists.get('Age').mean()] 

	Name	Born	Died	Age	Occupation
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

scientists[scientists['Age'] > scientists.Age.mean()] 

	Name	Born	Died	Age	Occupation
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

# Multiplying a DataFrame by a number repeats each string that many times
# and performs ordinary multiplication on the numeric values.
scientists * 2

	Name	Born	Died	Age	Occupation
0	Rosaline FranklinRosaline Franklin	1920-07-251920-07-25	1958-04-161958-04-16	74	ChemistChemist
1	William GossetWilliam Gosset	1876-06-131876-06-13	1937-10-161937-10-16	122	StatisticianStatistician
2	Florence NightingaleFlorence Nightingale	1820-05-121820-05-12	1910-08-131910-08-13	180	NurseNurse
3	Marie CurieMarie Curie	1867-11-071867-11-07	1934-07-041934-07-04	132	ChemistChemist
4	Rachel CarsonRachel Carson	1907-05-271907-05-27	1964-04-141964-04-14	112	BiologistBiologist
5	John SnowJohn Snow	1813-03-151813-03-15	1858-06-161858-06-16	90	PhysicianPhysician
6	Alan TuringAlan Turing	1912-06-231912-06-23	1954-06-071954-06-07	82	Computer ScientistComputer Scientist
7	Johann GaussJohann Gauss	1777-04-301777-04-30	1855-02-231855-02-23	154	MathematicianMathematician

데이터프레임 열의 자료형 바꾸기

scientists

	Name	Born	Died	Age	Occupation
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist
5	John Snow	1813-03-15	1858-06-16	45	Physician
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

# pandas reports string columns with the dtype `object`.
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

# info() also lists the dtype of every column; string columns again show
# up as `object`.
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        8 non-null      object
 1   Born        8 non-null      object
 2   Died        8 non-null      object
 3   Age         8 non-null      int64 
 4   Occupation  8 non-null      object
dtypes: int64(1), object(4)
memory usage: 448.0+ bytes

# For a single column, `dtype` can be used instead of `dtypes`
# (mind the singular/plural spelling).
for column in ('Born', 'Died', 'Age'):
    print(scientists[column].dtypes)

object
object
int64

# Dates stored as strings are converted to the datetime type with
# pd.to_datetime() so that date and time operations can be applied to them.
born_datetime = pd.to_datetime(scientists['Born'])
print(born_datetime)

died_datetime = pd.to_datetime(scientists['Died'])
print(died_datetime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]
0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]

파생 변수(계산에 의한 변수)

# Reload the dataset from the CSV file.
scientists = pd.read_csv('./data/scientists.csv')

# The datetime-typed born_datetime and died_datetime Series are going to be
# added to this DataFrame as new columns; show the frame as loaded first.
scientists

	Name	Born	Died	Age	Occupation
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist
1	William Gosset	1876-06-13	1937-10-16	61	Statistician
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist
5	John Snow	1813-03-15	1858-06-16	45	Physician
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician

# A column is added with: dataframe['new column name'] = data to add.
# The two columns could also be assigned in a single tuple assignment:
#   scientists['born_df'], scientists['died_df'] = born_datetime, died_datetime
scientists['born_df'] = born_datetime
scientists['died_df'] = died_datetime
scientists

	Name	Born	Died	Age	Occupation	born_df	died_df
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist	1920-07-25	1958-04-16
1	William Gosset	1876-06-13	1937-10-16	61	Statistician	1876-06-13	1937-10-16
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse	1820-05-12	1910-08-13
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist	1867-11-07	1934-07-04
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist	1907-05-27	1964-04-14
5	John Snow	1813-03-15	1858-06-16	45	Physician	1813-03-15	1858-06-16
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist	1912-06-23	1954-06-07
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician	1777-04-30	1855-02-23

# Convert the string date columns and store each result directly as a new
# column, without going through an intermediate variable.
for source, target in (('Born', 'born_df_2'), ('Died', 'died_df_2')):
    scientists[target] = pd.to_datetime(scientists[source])
scientists

	Name	Born	Died	Age	Occupation	born_df	died_df	born_df_2	died_df_2
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist	1920-07-25	1958-04-16	1920-07-25	1958-04-16
1	William Gosset	1876-06-13	1937-10-16	61	Statistician	1876-06-13	1937-10-16	1876-06-13	1937-10-16
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse	1820-05-12	1910-08-13	1820-05-12	1910-08-13
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist	1867-11-07	1934-07-04	1867-11-07	1934-07-04
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist	1907-05-27	1964-04-14	1907-05-27	1964-04-14
5	John Snow	1813-03-15	1858-06-16	45	Physician	1813-03-15	1858-06-16	1813-03-15	1858-06-16
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist	1912-06-23	1954-06-07	1912-06-23	1954-06-07
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician	1777-04-30	1855-02-23	1777-04-30	1855-02-23

# The string columns scientists['Died'] and scientists['Born'] do not
# support '-', so scientists['Died'] - scientists['Born'] raises an error.
# The derived columns born_df and died_df, converted with to_datetime(),
# do support it, which gives how long each person lived.
lifespan = scientists['died_df'] - scientists['born_df']
scientists['age_days_df'] = lifespan
scientists

	Name	Born	Died	Age	Occupation	born_df	died_df	born_df_2	died_df_2	age_days_df
0	Rosaline Franklin	1920-07-25	1958-04-16	37	Chemist	1920-07-25	1958-04-16	1920-07-25	1958-04-16	13779 days
1	William Gosset	1876-06-13	1937-10-16	61	Statistician	1876-06-13	1937-10-16	1876-06-13	1937-10-16	22404 days
2	Florence Nightingale	1820-05-12	1910-08-13	90	Nurse	1820-05-12	1910-08-13	1820-05-12	1910-08-13	32964 days
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist	1867-11-07	1934-07-04	1867-11-07	1934-07-04	24345 days
4	Rachel Carson	1907-05-27	1964-04-14	56	Biologist	1907-05-27	1964-04-14	1907-05-27	1964-04-14	20777 days
5	John Snow	1813-03-15	1858-06-16	45	Physician	1813-03-15	1858-06-16	1813-03-15	1858-06-16	16529 days
6	Alan Turing	1912-06-23	1954-06-07	41	Computer Scientist	1912-06-23	1954-06-07	1912-06-23	1954-06-07	15324 days
7	Johann Gauss	1777-04-30	1855-02-23	77	Mathematician	1777-04-30	1855-02-23	1777-04-30	1855-02-23	28422 days

시리즈, 데이터프레임 열 섞기

import random as r

# random.shuffle() reorders a mutable sequence in place, at random.
# Shuffle a plain-list copy of the Age values and assign it back as the
# column. Shuffling scientists['Age'] directly relies on chained in-place
# writes reaching the parent DataFrame, which raises SettingWithCopyWarning
# and leaves the frame unchanged under pandas Copy-on-Write.
# Only the Age column is shuffled; every other column keeps its row order
# (compare with the table above).
shuffled_ages = list(scientists['Age'])
r.shuffle(shuffled_ages)
scientists['Age'] = shuffled_ages
scientists

	Name	Born	Died	Age	Occupation	born_df	died_df	born_df_2	died_df_2	age_days_df
0	Rosaline Franklin	1920-07-25	1958-04-16	61	Chemist	1920-07-25	1958-04-16	1920-07-25	1958-04-16	13779 days
1	William Gosset	1876-06-13	1937-10-16	41	Statistician	1876-06-13	1937-10-16	1876-06-13	1937-10-16	22404 days
2	Florence Nightingale	1820-05-12	1910-08-13	77	Nurse	1820-05-12	1910-08-13	1820-05-12	1910-08-13	32964 days
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist	1867-11-07	1934-07-04	1867-11-07	1934-07-04	24345 days
4	Rachel Carson	1907-05-27	1964-04-14	90	Biologist	1907-05-27	1964-04-14	1907-05-27	1964-04-14	20777 days
5	John Snow	1813-03-15	1858-06-16	56	Physician	1813-03-15	1858-06-16	1813-03-15	1858-06-16	16529 days
6	Alan Turing	1912-06-23	1954-06-07	37	Computer Scientist	1912-06-23	1954-06-07	1912-06-23	1954-06-07	15324 days
7	Johann Gauss	1777-04-30	1855-02-23	45	Mathematician	1777-04-30	1855-02-23	1777-04-30	1855-02-23	28422 days

데이터프레임 열 삭제하기

# drop() removes columns from a DataFrame.
# Several columns are removed at once by passing their names in a list.
# drop() works on rows by default (axis=0), so column names given without
# saying they are columns are looked up among the row labels and raise an
# error. `columns=[...]` is equivalent to passing the list with axis=1.
scientists_dropped = scientists.drop(columns=['Born', 'Died'])
scientists_dropped

	Name	Age	Occupation	born_df	died_df	born_df_2	died_df_2	age_days_df
0	Rosaline Franklin	61	Chemist	1920-07-25	1958-04-16	1920-07-25	1958-04-16	13779 days
1	William Gosset	41	Statistician	1876-06-13	1937-10-16	1876-06-13	1937-10-16	22404 days
2	Florence Nightingale	77	Nurse	1820-05-12	1910-08-13	1820-05-12	1910-08-13	32964 days
3	Marie Curie	66	Chemist	1867-11-07	1934-07-04	1867-11-07	1934-07-04	24345 days
4	Rachel Carson	90	Biologist	1907-05-27	1964-04-14	1907-05-27	1964-04-14	20777 days
5	John Snow	56	Physician	1813-03-15	1858-06-16	1813-03-15	1858-06-16	16529 days
6	Alan Turing	37	Computer Scientist	1912-06-23	1954-06-07	1912-06-23	1954-06-07	15324 days
7	Johann Gauss	45	Mathematician	1777-04-30	1855-02-23	1777-04-30	1855-02-23	28422 days

데이터를 피클, csv, tsv 파일로 저장하고 불러오기

# Pickle stores an object serialized in binary form. The name refers to
# preserving data so that it keeps for a long time.

# NOTE(review): the original note says a pickle takes less space than a
# spreadsheet file - confirm for the data at hand.

names = scientists['Name']
print(names)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

# A Series or DataFrame is saved as a pickle with to_pickle().
# The file name does not need to carry an extension.
series_pickle_path = './output/scientists_name_series.pickle'
names.to_pickle(series_pickle_path)

# A pickle is a binary object, so opening the saved file in a text editor
# such as Notepad shows unreadable characters.

# A pickle is loaded back with read_pickle().
restored_names = pd.read_pickle(series_pickle_path)
print(restored_names)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

frame_pickle_path = './output/scientists_df.pickle'
scientists.to_pickle(frame_pickle_path)
# A DataFrame renders more cleanly when it is shown as the cell result
# instead of being passed to print().
pd.read_pickle(frame_pickle_path)

	Name	Born	Died	Age	Occupation	born_df	died_df	born_df_2	died_df_2	age_days_df
0	Rosaline Franklin	1920-07-25	1958-04-16	61	Chemist	1920-07-25	1958-04-16	1920-07-25	1958-04-16	13779 days
1	William Gosset	1876-06-13	1937-10-16	41	Statistician	1876-06-13	1937-10-16	1876-06-13	1937-10-16	22404 days
2	Florence Nightingale	1820-05-12	1910-08-13	77	Nurse	1820-05-12	1910-08-13	1820-05-12	1910-08-13	32964 days
3	Marie Curie	1867-11-07	1934-07-04	66	Chemist	1867-11-07	1934-07-04	1867-11-07	1934-07-04	24345 days
4	Rachel Carson	1907-05-27	1964-04-14	90	Biologist	1907-05-27	1964-04-14	1907-05-27	1964-04-14	20777 days
5	John Snow	1813-03-15	1858-06-16	56	Physician	1813-03-15	1858-06-16	1813-03-15	1858-06-16	16529 days
6	Alan Turing	1912-06-23	1954-06-07	37	Computer Scientist	1912-06-23	1954-06-07	1912-06-23	1954-06-07	15324 days
7	Johann Gauss	1777-04-30	1855-02-23	45	Mathematician	1777-04-30	1855-02-23	1777-04-30	1855-02-23	28422 days

# to_csv() saves a Series or DataFrame as a text file (csv or tsv).
# The separator defaults to a comma (csv); to write a tsv file, set the
# `sep` parameter to '\t'.
# To check that a file really is tab-separated, open it in a text editor
# such as Notepad rather than in Excel.
exports = (
    (names, './output/scientists_name_series.csv', ','),
    (names, './output/scientists_name_series.tsv', '\t'),
    (scientists, './output/scientists_df.csv', ','),
    (scientists, './output/scientists_df.tsv', '\t'),
)
for data, path, delimiter in exports:
    data.to_csv(path, sep=delimiter)

Twitter Facebook LinkedIn

Kim-ho-yeon

6_판다스2

공유하기

댓글남기기

참고

리듬킹2(개인 프로젝트)

리듬킹1(개인 프로젝트)

스도쿠(개인 프로젝트)

미로(개인 프로젝트)