05 ten_years_stock

10 Years Korea Stock Price

  • KOSPI 200 종목을 대상으로 10년 동안 수익률에 대해 시각화 하였음
  • 200종목을 11개의 산업으로 분류 하였음(gics 기준)
  • 주가는 Yahoo Finance에서 다운을 받아 수정 종가가 아닌 일반 종가를 사용하여 수익률을 계산하였음
In [1]:
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
import pandas as  pd
from pandas_datareader import data, wb  
from datetime import datetime
import seaborn as sns
import squarify

import matplotlib.font_manager as fm
import matplotlib
font_location = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

from pylab import rcParams
rcParams['figure.figsize'] = 16, 10

from pandas_datareader import data as pdr
import fix_yahoo_finance as yf
import numpy as np
/home/bono/anaconda3/lib/python3.5/site-packages/fix_yahoo_finance/__init__.py:43: DeprecationWarning: 
    Auto-overriding of pandas_datareader's get_data_yahoo() is deprecated and no longer available.
    Use pdr_override() to explicitly override it.
  DeprecationWarning)
In [3]:
df_prices = pd.read_csv("yahoo_prices.csv")
startday = datetime(2007, 1, 1)
endday = datetime(2017, 9, 18)
# download dataframe
df_kospi = pdr.get_data_yahoo('^KS11', start=startday, end=endday)
#gics = pd.read_excel("gics.xlsx", converters={'상장주식수(주)':float})
gics = pd.read_excel("gics.xlsx", converters={'scode':str})
gics = gics.rename(columns={'업종': 'sector','종목명': 'asset','거래대금(원)':'tr_vol','상장시가총액(원)':'mar_cap'})
gics = gics[['sector', 'scode', 'asset','mar_cap', 'tr_vol' ]]
gics['sector'][gics['sector'].str.contains('정보')] = '정보기술'
gics.head(10)
Out[3]:
sectorscodeassetmar_captr_vol
0소재006840AK홀딩스8955351236001646716200
1소재155660DSR87040000000441700520
2소재037370EG7575000000011451526990
3소재032860GMR 머티리얼즈37745233280162892924
4소재036640HRS71634024000275362300
5소재036670KCI7674870000039083330
6소재109070KGP267206139204758607500
7소재001390KG케미칼2361435435003483761950
8소재001940KISCO홀딩스269016092800106181500
9소재083420KPX그린케미칼9240000000021469310
In [4]:
#gics.rename(columns={'업종':'sector','종목명':'asset'},inplace=True)

df_prices.rename(columns={'Date':'date','sname':'asset','Cl':'close'},inplace=True)
df_prices = df_prices.merge(gics[['sector','asset','mar_cap']], left_on='asset', right_on='asset' )
df_prices = df_prices[['date','sector','asset','close','mar_cap']]
In [5]:
df_170901 = df_prices[df_prices['date'] == '2017-09-01']

WordCloud를 사용하여 Kospi200 종목들을 표시함

In [6]:
from wordcloud import WordCloud
df_170901['asset'].values
dlist = (df_170901['asset'].values)
#wordcloud = WordCloud(font_path= "/usr/share/fonts/truetype/nanum/NanumGothic.ttf" ).generate(str(dlist))
wordcloud = WordCloud(font_path= "/usr/share/fonts/truetype/nanum/NanumGothic.ttf", relative_scaling = 0.2, 
                    min_font_size = 5, max_font_size = 20 ).generate(str(dlist))

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
/home/bono/anaconda3/lib/python3.5/site-packages/wordcloud/wordcloud.py:32: ResourceWarning: unclosed file <_io.TextIOWrapper name='/home/bono/anaconda3/lib/python3.5/site-packages/wordcloud/stopwords' mode='r' encoding='UTF-8'>
  os.path.join(os.path.dirname(__file__), 'stopwords')).read().split('\n')])
In [7]:
grby1 = df_170901.groupby('sector').count().reset_index()
In [8]:
grby1  = grby1.sort_values('date',ascending=True)
df_170901.head()
Out[8]:
datesectorassetclosemar_cap
26542017-09-01금융우리은행18550.012201800000000
53202017-09-01자유소비재경방14700.0371476908500
79862017-09-01필수소비재삼양홀딩스105500.0829877859900
99492017-09-01필수소비재하이트진로24850.01767366997200
126152017-09-01산업재대림산업85400.02832720000000

산업별 분류

In [9]:
height = grby1['date']
bars = grby1['sector']
y_pos = np.arange(len(bars))

plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.show()
#https://python-graph-gallery.com/1-basic-barplot/

시가총액별 TreeMap

In [10]:
squarify.plot(sizes=df_170901['mar_cap'], label=df_170901['asset'], alpha=.8,  ) 
plt.axis('off')
Out[10]:
(0.0, 100.0, 0.0, 100.0)
In [11]:
df_prices.head()
Out[11]:
datesectorassetclosemar_cap
02007-01-02금융우리은행21486.90039112201800000000
12007-01-03금융우리은행21039.30078112201800000000
22007-01-04금융우리은행20591.59960912201800000000
32007-01-05금융우리은행20342.90039112201800000000
42007-01-08금융우리은행19994.80078112201800000000

df_prices(Kospi200 종목들 데이터)와 df_kospi를 merge

In [12]:
#df_prices를 피봇 처리
df_prices= df_prices.pivot_table(values= 'close', index = 'date', columns= ['sector','asset'])
In [13]:
df_prices.head()
Out[13]:
sector건강관리...필수소비재
assetJW중외제약JW홀딩스광동제약녹십자녹십자홀딩스대웅제약동아쏘시오홀딩스동아에스티보령제약부광약품...아모레퍼시픽에이블씨엔씨오뚜기오리온이마트코스맥스팜스코하이트진로한국콜마현대그린푸드
date
2007-01-0217956.699219NaN3020.052449.8007815954.54980549160.30078158425.699219NaN15046.5996099416.349609...56300.0NaN100500.012771.900391NaNNaN1990.0NaNNaN7300.0
2007-01-0317210.900391NaN3030.051072.0000005927.27002048825.30078157111.898438NaN14940.0996099227.480469...53800.0NaN98000.012452.599609NaNNaN1965.0NaNNaN7010.0
2007-01-0417466.000000NaN2970.051255.6992195918.18017648909.10156356143.898438NaN14922.4003919308.419922...52500.0NaN95900.012231.599609NaNNaN1895.0NaNNaN6830.0
2007-01-0517073.500000NaN2960.049694.1015635909.08984448574.10156354899.398438NaN14886.9003919443.330078...52500.0NaN94000.012354.400391NaNNaN1930.0NaNNaN6900.0
2007-01-0816582.900391NaN2970.049326.6992195909.08984447150.39843853931.398438NaN14940.0996099227.480469...53400.0NaN94000.012477.200195NaNNaN1930.0NaNNaN6780.0

5 rows × 200 columns

In [14]:
df_kospi.rename(columns={'Date':'date','Close':'kospi'},inplace=True)
df_kospi = df_kospi[['kospi']]
df_kospi.columns = pd.MultiIndex.from_tuples([('index','kospi')])
df_pr = df_kospi.merge(df_prices, left_index = True, right_index = True)
In [15]:
df_pr.head(5)
Out[15]:
index건강관리...필수소비재
kospiJW중외제약JW홀딩스광동제약녹십자녹십자홀딩스대웅제약동아쏘시오홀딩스동아에스티보령제약...아모레퍼시픽에이블씨엔씨오뚜기오리온이마트코스맥스팜스코하이트진로한국콜마현대그린푸드
2007-01-021435.26001017956.699219NaN3020.052449.8007815954.54980549160.30078158425.699219NaN15046.599609...56300.0NaN100500.012771.900391NaNNaN1990.0NaNNaN7300.0
2007-01-031409.34997617210.900391NaN3030.051072.0000005927.27002048825.30078157111.898438NaN14940.099609...53800.0NaN98000.012452.599609NaNNaN1965.0NaNNaN7010.0
2007-01-041397.29003917466.000000NaN2970.051255.6992195918.18017648909.10156356143.898438NaN14922.400391...52500.0NaN95900.012231.599609NaNNaN1895.0NaNNaN6830.0
2007-01-051385.76001017073.500000NaN2960.049694.1015635909.08984448574.10156354899.398438NaN14886.900391...52500.0NaN94000.012354.400391NaNNaN1930.0NaNNaN6900.0
2007-01-081370.81005916582.900391NaN2970.049326.6992195909.08984447150.39843853931.398438NaN14940.099609...53400.0NaN94000.012477.200195NaNNaN1930.0NaNNaN6780.0

5 rows × 201 columns

월별 수익률 계산

In [16]:
#  일별 시계열을 월별로 resample 함
df_prm = df_pr.resample('BM').apply(lambda x:x[-1])
In [17]:
df_prm.head(5)
Out[17]:
index건강관리...필수소비재
kospiJW중외제약JW홀딩스광동제약녹십자녹십자홀딩스대웅제약동아쏘시오홀딩스동아에스티보령제약...아모레퍼시픽에이블씨엔씨오뚜기오리온이마트코스맥스팜스코하이트진로한국콜마현대그린푸드
2007-01-311360.22998015503.599609NaN2705.048408.1015636072.72998045140.39843849782.800781NaN13840.000000...56200.0NaN86000.012084.200195NaNNaN1830.0NaNNaN6540.0
2007-02-281417.33996616013.799805NaN2910.049326.6992196272.72998049495.300781NaNNaN12828.599609...55400.0NaN86200.011273.700195NaNNaN1775.0NaNNaN6340.0
2007-03-301452.55004916092.299805NaN3305.047122.1992196227.27002053180.30078150474.199219NaN13840.000000...51800.0NaN88200.011445.599609NaNNaN2005.0NaNNaN6270.0
2007-04-301542.23999015464.299805NaN3095.051990.5000006545.45019544637.89843845841.699219NaN14620.799805...60200.0NaN109500.012501.799805NaNNaN2000.0NaNNaN9250.0
2007-05-311700.91003419232.300781NaN3850.060533.1015637227.27002049997.80078154346.199219NaN15543.400391...78000.0NaN126000.012919.299805NaNNaN2300.0NaNNaN12350.0

5 rows × 201 columns

In [18]:
# 중간에 수익율이 비어 있으면 보간
df_prm =df_prm.interpolate()
# 전월에 수익률이 없으면 다음월의 수익률로 대체 
df_prm= df_prm.fillna(method='bfill')
# 수익률 계산
df_m_ret = df_prm.pct_change()
# 첫월 수익률 NA이므로 제거
df_m_ret = df_m_ret.dropna()
In [19]:
# 누적수익률 계산
df_m_cumret = (df_m_ret + 1).cumprod() -1 
In [20]:
df_m_cumret.head(5)
Out[20]:
index건강관리...필수소비재
kospiJW중외제약JW홀딩스광동제약녹십자녹십자홀딩스대웅제약동아쏘시오홀딩스동아에스티보령제약...아모레퍼시픽에이블씨엔씨오뚜기오리온이마트코스맥스팜스코하이트진로한국콜마현대그린푸드
2007-02-280.0419860.0329080.00.0757860.0189760.0329340.0964750.0069440.0-0.073078...-0.0142350.00.002326-0.0670710.00.0-0.0300550.00.0-0.030581
2007-03-300.0678710.0379720.00.221811-0.0265640.0254480.1781090.0138880.00.000000...-0.0782920.00.025581-0.0528460.00.00.0956280.00.0-0.041284
2007-04-300.133808-0.0025350.00.1441770.0740040.077843-0.011132-0.0791660.00.056416...0.0711740.00.2732560.0345570.00.00.0928960.00.00.414373
2007-05-310.2504580.2405060.00.4232900.2504750.1901190.1076070.0916660.00.123078...0.3879000.00.4651160.0691070.00.00.2568310.00.00.888379
2007-06-290.2818420.3650090.00.4898340.4041760.3547890.2272730.2854170.00.308504...0.3024910.00.4534880.1219530.00.00.1229510.00.00.605505

5 rows × 201 columns

업종 평균을 계산후 그래프

In [21]:
#
#df_m_cumret.plot()
df_m_cumret.groupby(level=[0], axis =1).mean().plot()
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcfaaa4c128>
In [22]:
df_m_cumret.groupby(level=[0], axis =1).mean().tail(5)
Out[22]:
index건강관리금융산업재소재에너지유틸리티자유소비재정보기술통신서비스필수소비재
2017-05-310.7257232.8469870.3342860.7856632.4342082.0726750.2072372.4851501.215918-0.0659313.776401
2017-06-300.7583722.7026230.4092830.7093012.4672521.8903640.2180222.3194401.397933-0.0541063.368240
2017-07-310.7664002.5662010.5058210.6952752.5811162.1029380.2207742.2942981.319207-0.0002843.159197
2017-08-310.7373462.5517410.4311970.6382422.6487802.0749620.1334742.2017781.435864-0.1144253.062538
2017-09-290.7777952.4824360.4160490.6116512.6727351.9913070.0906682.1168701.629204-0.1382032.684909
  • 그래프와 표를 통해서 지난 10년동안 업종별로 건강관리, 필수소비재, 자유소비재, 에너지 정보, 소재 등이 높은 수익률을 보임.

섹터별 종목별 수익률 그래프

In [23]:
fig, axes = plt.subplots(4,3, figsize = (18,50))

grby2 = df_m_cumret.groupby(level=[0], axis = 1)
for i, (name, group) in enumerate(grby2):
    group.plot(ax = axes[i // 3][i % 3])

상위 15종목 월간 Heatmap

In [24]:
df_15 = df_m_cumret.sort_values('2017-09-29 00:00:00', ascending=0, axis = 1)
In [25]:
df_15 = df_15.iloc[:,:15]
df_15.columns = df_15.columns.droplevel()
In [26]:
df_15.columns.values
Out[26]:
array(['한샘', '한국타이어월드와이드', 'SPC삼립', '에이블씨엔씨', 'AK홀딩스', '한솔케미칼', 'CJ',
       'LG화학', '한미사이언스', '오뚜기', '제일파마홀딩스', '아모레G', '엔씨소프트', 'LG생활건강',
       '넥센타이어'], dtype=object)
In [27]:
df_m_15 = df_m_ret
df_m_15.columns = df_m_15.columns.droplevel()
In [28]:
df_m_15 = df_m_15[df_15.columns.values]
In [29]:
#df15 = df15.iloc[:,:15]
#df_m_15.columns = df_m_15.columns.droplevel()
df_m_15.columns.name = 'asset'
df_m_15.index.name = 'date'
In [30]:
import plt_m_heat
plt_m_heat.plot_monthly_ic_heatmap(df_m_15)
plt.show()

년도별 상위 10 종목

In [31]:
# 월별 수익률을 복리로 처리하는 것이 맞지만 계산의 편의를 위한 단순합함
df_m_ret.groupby(df_m_ret.index.year).sum()
Out[31]:
kospiJW중외제약JW홀딩스광동제약녹십자녹십자홀딩스대웅제약동아쏘시오홀딩스동아에스티보령제약...아모레퍼시픽에이블씨엔씨오뚜기오리온이마트코스맥스팜스코하이트진로한국콜마현대그린푸드
date
20070.3526110.110864-0.2480170.5735190.6022590.3647720.5988490.4225090.0000000.250920...0.3287240.0000000.5692320.0573320.0000000.0000000.0513570.0000000.0000000.532617
2008-0.465137-0.669974-0.905024-0.3341210.058210-0.352819-0.4081240.0538880.000000-0.744674...-0.0286070.0709230.095940-0.2120400.0000000.000000-0.2156630.0000000.000000-0.307858
20090.4375450.8938170.7844610.1767250.4448830.4259330.0121240.3833360.0000000.805876...0.4062152.4128180.0975060.5672280.0000000.0000000.723673-0.0885710.0000000.443642
20100.209841-0.0544410.4122110.1568430.1875320.3179650.025854-0.0011890.0000000.452043...0.2416320.686354-0.1133820.3290030.0000000.0000000.210615-0.1023320.0000000.362564
2011-0.0963290.1352950.1084420.1289910.0815720.428808-0.399130-0.2599230.000000-0.257736...-0.0452200.2187710.2188320.6042650.1520210.0000000.645478-0.2631680.0000000.399118
20120.0983610.0921790.0344160.530930-0.0157300.0807420.6568810.2476520.0000000.480906...0.1738471.3073820.3952510.510014-0.1401400.0000000.0923270.265495-0.0152910.066591
20130.0120830.2534100.1545230.268752-0.016487-0.1774210.5703200.734049-0.3396960.478674...-0.173476-0.6777800.790499-0.1116600.1390980.0000000.949751-0.2903140.014109-0.005679
2014-0.045805-0.0512650.0197070.3774690.1083570.598052-0.094252-0.149440-0.0333940.297083...0.865662-0.0073560.2438490.103179-0.2475150.4350170.4853820.0847660.5592200.156352
20150.0274901.4523291.8651170.1311470.3794210.8477440.2686460.4514060.5671720.448240...0.6767270.2761161.0029200.220711-0.0456450.6902650.2452320.0131730.8655970.302906
20160.0346460.6672280.275045-0.243803-0.087004-0.4893700.0816160.015437-0.2517470.213116...-0.2328000.049627-0.531977-0.521620-0.003523-0.256902-0.250692-0.061262-0.296037-0.452852
20170.180327-0.1526690.002391-0.0081880.1793460.4415590.439802-0.235396-0.251354-0.437211...-0.210657-0.1812590.1368700.7076430.212085-0.040110-0.0297250.182166-0.0298760.050472

11 rows × 201 columns

In [32]:
grby5= df_m_ret.groupby(df_m_ret.index.year)
In [33]:
fig, axes = plt.subplots(4,3, figsize = (18,50))
import matplotlib.cm as cm
for i, (name, group) in enumerate(grby5):
    srz = group.sum()
    srz[srz.rank(pct=True)>0.95].sort_values(ascending =False).plot(kind= 'barh' ,cmap=cm.RdYlGn,  
                                                                    ax = axes[i // 3][i % 3], title  = (" Return {} ".format(name)))


© 2017. All rights reserved.

Powered by ZooFighter v0.12