본문 바로가기

ML/tensorflow

1. word2Vec

#!/usr/bin/env python

# coding: utf-8


# In[1]:



# Toy training corpus: short sentences pairing royalty / gender / age words,
# used downstream to learn 2-D word embeddings with a skip-gram model.
corpus = [
    'king is a strong man',
    'queen is a wise woman',
    'boy is a young man',
    'girl is a young woman',
    'prince is a young king',
    'princess is a young queen',
    'man is strong',
    'woman is pretty',
    'prince is a boy will be king',
    'princess is a girl will be queen',
]



# In[2]:



# Use the Python 3 print function — the original `print corpus` is a
# Python 2 print statement and a SyntaxError on Python 3.
print(corpus)



# In[7]:



def remove_stop_words(corpus):
    """Strip stop words from each sentence in *corpus*.

    Args:
        corpus: iterable of space-separated sentence strings.

    Returns:
        list[str]: one string per input sentence with every occurrence
        of a stop word removed.

    Fix vs. original: the original used ``list.remove``, which deletes
    only the FIRST occurrence of each stop word in a sentence; filtering
    with a comprehension drops all occurrences.
    """
    stop_words = {'is', 'a', 'will', 'be'}  # set: O(1) membership test
    results = []
    for text in corpus:
        kept = [word for word in text.split(' ') if word not in stop_words]
        results.append(' '.join(kept))
    return results



# In[8]:



# Replace the corpus with its stop-word-free version and echo it.
corpus = remove_stop_words(corpus)
# Python 3 print function (the original was a Python 2 print statement).
print(corpus)



# In[10]:



# Vocabulary: the set of distinct tokens across the cleaned corpus.
words = {token for sentence in corpus for token in sentence.split(' ')}

words  # notebook cell: display the vocabulary



# In[12]:



# Assign each vocabulary word a unique integer id.
word2int = {word: i for i, word in enumerate(words)}

# Tokenize every cleaned sentence into a word list.
sentences = [sentence.split() for sentence in corpus]

WINDOW_SIZE = 2  # context window: this many neighbors on each side

# Build (center word, context word) skip-gram training pairs.
data = []
for sentence in sentences:
    for idx, center in enumerate(sentence):
        lo = max(idx - WINDOW_SIZE, 0)
        hi = min(idx + WINDOW_SIZE, len(sentence)) + 1
        for neighbor in sentence[lo:hi]:
            if neighbor != center:
                data.append([center, neighbor])

                



# In[14]:



import pandas as pd

# Notebook cell: echo the cleaned corpus, one sentence per line.
for sentence in corpus:
    print(sentence)

# Skip-gram pairs as a two-column table: center word -> context word.
df = pd.DataFrame(data, columns=['input', 'label'])



# In[15]:



df.head(10)  # notebook cell: preview the first 10 (input, label) pairs



# In[16]:



df.shape  # notebook cell: (number of skip-gram pairs, 2)



# In[17]:



word2int  # notebook cell: display the word -> integer id mapping



# In[19]:



import tensorflow as tf

import numpy as np


ONE_HOT_DIM = len(words)  # vocabulary size = length of each one-hot vector


def to_one_hot_encoding(data_point_index, dim=None):
    """Return a 1-D one-hot vector with a 1 at *data_point_index*.

    Args:
        data_point_index: integer position to set hot.
        dim: vector length. Defaults to the module-level ONE_HOT_DIM
            (vocabulary size), keeping the original call sites working,
            but can now be passed explicitly to reuse the helper.

    Returns:
        numpy.ndarray of shape (dim,): all zeros except a single 1.0.
    """
    if dim is None:
        dim = ONE_HOT_DIM
    one_hot_encoding = np.zeros(dim)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding


# One-hot encode every skip-gram pair: X is the center word, Y its context.
X = [to_one_hot_encoding(word2int[center]) for center in df['input']]
Y = [to_one_hot_encoding(word2int[context]) for context in df['label']]

# Stack into dense (num_pairs, ONE_HOT_DIM) training matrices.
X_train = np.asarray(X)
Y_train = np.asarray(Y)


# --- Skip-gram network: one-hot input -> 2-D embedding -> softmax over vocab ---
# (TensorFlow 1.x graph-mode API.)

x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))        # center word, one-hot
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))  # context word, one-hot

EMBEDDING_DIM = 2  # 2-D so the learned vectors can be scatter-plotted directly

# Hidden layer: its weight rows (plus bias) ARE the word vectors we extract later.
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1]))  # bias
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# Output layer: project the embedding back to vocabulary size, softmax-normalize.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_layer, W2), b2))

# Loss: cross entropy. The small epsilon guards tf.log against a softmax
# output that underflows to exactly 0, which would give log(0) = -inf and
# NaN gradients partway through training.
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction + 1e-10), axis=[1]))

# Training operation: plain gradient descent.
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)



# In[20]:



# TF1-style explicit session; variables must be initialized before any run.
sess = tf.Session()

init = tf.global_variables_initializer()

sess.run(init) 


# Full-batch gradient descent over all skip-gram pairs.
iteration = 20000

for i in range(iteration):

    # input is X_train which is one hot encoded word

    # label is Y_train which is one hot encoded neighbor word

    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})

    # Log the loss every 3000 steps to watch convergence.
    if i % 3000 == 0:

        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))



# In[21]:



# The learned embeddings: hidden-layer weights plus bias, one row per word
# (W1 + b1 broadcasts the single-element bias across every row/column).
vectors = sess.run(W1 + b1)

print(vectors)



# In[22]:



# Embedding table: one row per vocabulary word with its learned 2-D coords.
w2v_df = pd.DataFrame(vectors, columns=['x1', 'x2'])
# Fix: assigning a raw set as a column raises TypeError ("Set type is
# unordered") on modern pandas — materialize it as a list. Within this run,
# iterating `words` here yields the same order as the enumerate(words) used
# to build word2int, so rows line up with the embedding matrix.
w2v_df['word'] = list(words)
w2v_df = w2v_df[['word', 'x1', 'x2']]
w2v_df  # notebook cell: display the embedding table



# In[25]:



import matplotlib.pyplot as plt

# Fix: set the default figure size BEFORE creating the figure. The original
# assigned rcParams["figure.figsize"] after plt.subplots(), which has no
# effect on an already-created figure.
plt.rcParams["figure.figsize"] = (10, 10)

fig, ax = plt.subplots()

# Label each word at its learned 2-D coordinate.
# NOTE(review): only text annotations are drawn (no ax.scatter markers) —
# this matches the original; add a scatter if point markers are wanted.
for word, x1, x2 in zip(w2v_df['word'], w2v_df['x1'], w2v_df['x2']):
    ax.annotate(word, (x1, x2))

# Frame the axes around the embedding extremes with a margin so the
# annotations are not clipped at the plot edges.
PADDING = 1.0
x_axis_min = np.amin(vectors, axis=0)[0] - PADDING
y_axis_min = np.amin(vectors, axis=0)[1] - PADDING
x_axis_max = np.amax(vectors, axis=0)[0] + PADDING
y_axis_max = np.amax(vectors, axis=0)[1] + PADDING

plt.xlim(x_axis_min, x_axis_max)
plt.ylim(y_axis_min, y_axis_max)

plt.show()



# In[24]:






# In[ ]:





'ML > tensorflow' 카테고리의 다른 글

이력서 자동 생성 - 로직  (0) 2021.10.05