04 stock_clustering

26 Oct 2017 | 24 Minute Read on Visualization

%matplotlib inline

In [1]:

import matplotlib.font_manager as fm
import matplotlib

Visualizing the stock market structure

In [3]:

# Point matplotlib's default font family at the NanumGothic TrueType file
# (presumably so the Hangul asset names render in figures -- confirm).
font_location = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
nanum_props = fm.FontProperties(fname=font_location)
font_name = nanum_props.get_name()
matplotlib.rcParams['font.family'] = font_name

In [5]:

# Read the raw price table and rename its columns to the names used below.
column_aliases = {'Date': 'date', 'sname': 'asset', 'Cl': 'close', 'Op': 'open'}
prices = pd.read_csv("stock170920.csv").rename(columns=column_aliases)

In [6]:

# Group rows per asset and keep only the groups that have at least 2667
# non-null closing prices; `names` holds the distinct asset labels of those rows.
g = prices.groupby('asset')
long_history_rows = g.filter(lambda grp: grp['close'].count() >= 2667)
names = long_history_rows['asset'].unique()

In [7]:

# Distinct 'scode' values of the rows whose asset group has at least 2667
# non-null closing prices (same threshold as used for `names`).
has_long_history = lambda grp: grp['close'].count() >= 2667
symbols = g.filter(has_long_history)['scode'].unique()

In [8]:

# Keep only the rows of the selected assets (filter computed once), then build
# wide tables with one row per date and one column per asset.
selected_prices = prices[prices['asset'].isin(names)]

# pivot_table orders its columns by sorted asset label, whereas `names` is in
# first-appearance order. Later cells pair `names` positionally with per-column
# results (cluster labels, embedding coordinates), so the columns are selected
# explicitly in `names` order. `.loc` raises KeyError if an expected asset is
# missing instead of silently misaligning names and columns.
close_prices = selected_prices.pivot_table(
    values='close', index='date', columns='asset').loc[:, names]
open_prices = selected_prices.pivot_table(
    values='open', index='date', columns='asset').loc[:, names]

Step 1: select a fixed window of closing prices and compute row-over-row percentage changes

In [10]:

# Positional row window 600..698 of the close table, and the same-length window
# shifted forward by one row (601..699); both are stored transposed.
close_prices_0 = close_prices.iloc[600:699].transpose()
close_prices_1 = close_prices.iloc[601:700].transpose()

# Row-over-row percentage change inside the 600..698 window. The first row has
# no predecessor inside the window, so its values are NaN.
variation = close_prices.iloc[600:699].pct_change()

In [11]:

# Discard every row that contains a NaN, then flip rows and columns.
variation = variation.dropna(axis=0, how='any').transpose()

In [ ]:

In [13]:

######################################################################
# Learn a graphical structure from the correlations

# scikit-learn renamed GraphLassoCV to GraphicalLassoCV and later removed the
# old name. Prefer the current class and fall back to the legacy one so the
# cell runs on both old and new installs.
try:
    _graphical_lasso_cv = covariance.GraphicalLassoCV
except AttributeError:
    _graphical_lasso_cv = covariance.GraphLassoCV
edge_model = _graphical_lasso_cv()

# Standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery. X is a transposed copy of
# `variation`, and each of its columns is divided by that column's own
# standard deviation.
X = variation.copy().T
X /= X.std(axis=0)

In [14]:

# Fit the sparse covariance estimator on the standardized data.
edge_model.fit(X)

# #############################################################################
# Cluster using affinity propagation

# affinity_propagation returns a pair; only the per-item cluster labels (the
# second element) are used here.
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()

# Print the member names of every cluster, numbering clusters from 1.
for cluster_no in range(1, n_labels + 2):
    members = names[labels == cluster_no - 1]
    print('Cluster %i: %s' % (cluster_no, ', '.join(members)))

# #############################################################################
# Find a low-dimension embedding for visualization: find the best position of
# the nodes (the stocks) on a 2D plane

# A dense eigen_solver is used for reproducibility (arpack is initiated with
# random vectors that we don't control).
lle_options = dict(n_components=2, eigen_solver='dense', n_neighbors=6)
node_position_model = manifold.LocallyLinearEmbedding(**lle_options)

# Embed the transposed data, then transpose the result so that embedding[0]
# and embedding[1] are the two coordinate rows.
node_coords = node_position_model.fit_transform(X.T)
embedding = node_coords.T

/home/bono/anaconda3/lib/python3.5/site-packages/numpy/linalg/linalg.py:1741: RuntimeWarning: invalid value encountered in slogdet
  sign, logdet = _umath_linalg.slogdet(a, signature=signature)

Cluster 1: SKC, SK네트웍스, 한샘, 한일시멘트
Cluster 2: LG디스플레이, SK하이닉스, 동부하이텍, 삼성전자, 효성
Cluster 3: CJ, KT, POSCO, S-Oil, 기업은행, 삼성화재, 신도리코, 신한지주, 쌍용양회, 오뚜기, 우리은행, 한국전력
Cluster 4: GS건설, NH투자증권, OCI, 대림산업, 동부화재, 두산중공업, 한국금융지주, 현대건설, 현대산업
Cluster 5: LG전자, LG화학, 삼성SDI, 삼성전기, 한솔테크닉스, 현대모비스, 현대차
Cluster 6: 롯데케미칼, 아모레G, 한화케미칼
Cluster 7: LS산전, SK케미칼, 한화테크윈
Cluster 8: KCC, 고려아연, 남해화학, 삼성중공업, 한국가스공사, 현대미포조선, 현대제철, 현대중공업

In [15]:

# #############################################################################
# Visualization
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')

# The 'spectral' colormap was renamed 'nipy_spectral' and the old attribute was
# later removed from matplotlib; use whichever name this install provides.
try:
    node_cmap = plt.cm.nipy_spectral
except AttributeError:
    node_cmap = plt.cm.spectral

# Display a graph of the partial correlations: rescale the precision matrix by
# 1/sqrt(diagonal) along both axes, then keep the strictly-upper-triangle
# entries whose magnitude exceeds 0.02 as edges.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
            cmap=node_cmap)

# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
#            linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [[embedding[:, start], embedding[:, stop]]
            for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
# values.max() raises on an empty array, so only draw edges when at least one
# entry passed the threshold above.
if values.size:
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

# Guard the colour scaling against a single-cluster result (n_labels == 0).
label_scale = float(max(n_labels, 1))

# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):

    # Offsets from this node to every node; the node's own entry is set to 1
    # so it is not picked as its own nearest neighbour below.
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=node_cmap(label / label_scale),
                       alpha=.6))

# np.ptp() is used instead of the ndarray.ptp() method, which NumPy 2.0 removed.
x_span = np.ptp(embedding[0])
y_span = np.ptp(embedding[1])
plt.xlim(embedding[0].min() - .15 * x_span,
         embedding[0].max() + .10 * x_span)
plt.ylim(embedding[1].min() - .03 * y_span,
         embedding[1].max() + .03 * y_span)

plt.show()