# See : https://medium.com/media/97d3727da2a20f72c32cacac5c92a7e4/href
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.preprocessing import MultiLabelBinarizer
# See : https://medium.com/media/8883aaba5a568e7d8780225c18bef024/href
# Fetch the metadata CSV into the working directory, then load it.
# urllib is used instead of the notebook-only `!wget` shell escape so this
# snippet also runs as a plain Python script.
urllib.request.urlretrieve(
    'https://storage.googleapis.com/movies_data/movies_metadata.csv',
    'movies_metadata.csv',
)
data = pd.read_csv('movies_metadata.csv')

# Text fed to the model and the labels to predict, one entry per CSV row.
descriptions = data['overview']
genres = data['genres']
# See : https://medium.com/media/1aeec70771ffccac98585c5e98dc0688/href
# Genre names of interest; a genre's position in this list is its index.
top_genres = [
    'Comedy',
    'Thriller',
    'Romance',
    'Action',
    'Horror',
    'Crime',
    'Documentary',
    'Adventure',
    'Science Fiction',
]
# See : https://medium.com/media/6e14ba2cef44f6931398cf2f28a4c495/href
# The first 80% of the rows are used for training and the remaining rows are
# held out for evaluation. The split is by position, so row order is kept.
train_size = int(len(descriptions) * .8)

train_descriptions, test_descriptions = (
    descriptions[:train_size],
    descriptions[train_size:],
)
train_genres, test_genres = (
    genres[:train_size],
    genres[train_size:],
)
# See : https://medium.com/media/83f9a1bdb20f28c576f27a5947e0f151/href
# Feature column that embeds raw description text with the Universal Sentence
# Encoder module from TF Hub.
# The key has to match the key of the features dict handed to the estimator;
# the train, eval and predict input functions in this file all use
# "descriptions", so the column uses the same name.
description_embeddings = hub.text_embedding_column(
    "descriptions",
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
)
# See : https://medium.com/media/eec3cb4490796fe96bc0d60cee441fd3/href
# Genre lookup, each genre corresponds to an index
top_genres = [
    'Comedy',
    'Thriller',
    'Romance',
    'Action',
    'Horror',
    'Crime',
    'Documentary',
    'Adventure',
    'Science Fiction',
]

# Multi-hot label for an action and adventure movie
# (1 at index 3 for 'Action' and at index 7 for 'Adventure'):
# [0 0 0 1 0 0 0 1 0]
# See : https://medium.com/media/5a1bb0c1a4933ec65c07a37411dabd7b/href
# Encode each row's genres as a multi-hot vector.
# NOTE(review): MultiLabelBinarizer treats every entry as an iterable of
# labels; if the CSV column holds plain strings rather than lists, each
# character would become its own class -- confirm the column format.
encoder = MultiLabelBinarizer()

# Fit on the training labels and keep the encoding produced by that same
# call, instead of discarding it and transforming the data a second time.
train_encoded = encoder.fit_transform(train_genres)
# Reuse the fitted label set so both splits share the same columns.
test_encoded = encoder.transform(test_genres)

num_classes = len(encoder.classes_)
# See : https://medium.com/media/ffc095999d9031dae32f474ee7d1872d/href
# Head for multi-label classification over `num_classes` classes, with the
# loss reduced using SUM_OVER_BATCH_SIZE.
multi_label_head = tf.contrib.estimator.multi_label_head(
    n_classes=num_classes,
    loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE,
)
# See : https://medium.com/media/9f28242b0d6c8f181b18ebdd01ca5ddc/href
# DNN estimator fed by the description embedding column, with two hidden
# layers (64 and 10 units) and the multi-label head defined earlier.
estimator = tf.contrib.estimator.DNNEstimator(
    feature_columns=[description_embeddings],
    hidden_units=[64, 10],
    head=multi_label_head,
)
# See : https://medium.com/media/7bb7749e029a7f9eaadfe2e9cb524151/href
# Format our data for the numpy_input_fn
# Descriptions are cast to str and labels to int32, the same way the
# evaluation input in this file does, so training and evaluation feed the
# model identical dtypes.
# NOTE(review): the str cast presumably also guards against non-string
# entries (e.g. missing overviews) -- confirm against the CSV.
features = {
    "descriptions": np.array(train_descriptions).astype(str)
}
labels = np.array(train_encoded).astype(np.int32)

# Shuffled batches of 32 examples, iterating over the data 20 times.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    features,
    labels,
    shuffle=True,
    batch_size=32,
    num_epochs=20,
)
# See : https://medium.com/media/62e6792db3617a345627f3e78b1e29b1/href
# Train the model. No `steps`/`max_steps` limit is passed, so the length of
# the run is determined by the input function.
estimator.train(train_input_fn)
# See : https://medium.com/media/3dde4159984bbef42446021035801138/href
# Evaluation input over the held-out rows, in their original order.
# The builtin `str` is used for the cast: `np.str` was only an alias for it,
# is deprecated, and no longer exists in recent NumPy releases.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    {"descriptions": np.array(test_descriptions).astype(str)},
    test_encoded.astype(np.int32),
    shuffle=False,
)

estimator.evaluate(input_fn=eval_input_fn)
# See : https://medium.com/media/500321f5f6a849e2a155cfbe6fcfda80/href
# Hand-written descriptions to try the trained model on. The genre noted
# above each entry is the author's expectation, not a label used by the code.
raw_test = [
    # Documentary
    "An examination of our dietary choices and the food we put in our bodies. Based on Jonathan Safran Foer's memoir.",
    # Comedy
    "A teenager tries to survive the last week of her disastrous eighth-grade year before leaving to start high school.",
    # Action, Adventure
    "Ethan Hunt and his IMF team, along with some familiar allies, race against time after a mission gone wrong.",
]
# See : https://medium.com/media/5368313e3e0d7016ed08ba986ee13975/href
# Prediction input: features only (no labels), kept in the order of
# `raw_test`.
# The builtin `str` is used for the cast: `np.str` was only an alias for it,
# is deprecated, and no longer exists in recent NumPy releases.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    {"descriptions": np.array(raw_test).astype(str)},
    shuffle=False,
)

results = estimator.predict(predict_input_fn)
# See : https://medium.com/media/8c54a017ecdaca6aad0fc034d32dd575/href
# For every prediction, print the two most probable genres, most probable
# first, with the probability shown as a percentage rounded to 2 decimals.
for movie_genres in results:
    probabilities = movie_genres['probabilities']
    # argsort is ascending, so reversing it puts the largest values first.
    top_2 = probabilities.argsort()[::-1][:2]
    for genre in top_2:
        text_genre = encoder.classes_[genre]
        percent = round(probabilities[genre] * 100, 2)
        print(text_genre + ': ' + str(percent) + '%')