空间站广场

论文

Notebooks

比赛

课程

Apps

我的主页

我的Notebooks

我的论文库

我的足迹

我的工作空间

任务

节点

文件

数据集

镜像

项目

数据库

公开

Lab 03 : Content recommendation

Machine Learning

xuxh@dp.tech

更新于 2024-10-15

推荐镜像 :Basic Image:bohrium-notebook:2023-04-07

推荐机型 :c2_m4_cpu

Lecture : Recommendation on Graphs

Lab 03 : Content recommendation

Xavier Bresson

Synthetic dataset

Real-world dataset SWEETRS

Lecture : Recommendation on Graphs

Lab 03 : Content recommendation

Xavier Bresson

代码

文本

[1]

# For Google Colaboratory

import sys, os

if 'google.colab' in sys.modules:

# mount google drive

from google.colab import drive

drive.mount('/content/gdrive')

path_to_file = '/content/gdrive/My Drive/GML2023_codes/codes/05_Recommendation'

print(path_to_file)

# change current path to the folder containing "path_to_file"

os.chdir(path_to_file)

!pwd

代码

文本

[2]

# Load libraries

import numpy as np

import scipy.io

%matplotlib inline

#%matplotlib notebook

from matplotlib import pyplot

import matplotlib.pyplot as plt

plt.rcParams.update({'figure.max_open_warning': 0})

import time

import sys; sys.path.insert(0, 'lib/')

from lib.utils import shrink

from lib.utils import graph_laplacian

import scipy.sparse.linalg

import warnings; warnings.filterwarnings("ignore")

from lib.utils import compute_ncut, reindex_W_with_classes, construct_knn_graph

import torch

import networkx as nx

代码

文本

Synthetic dataset

代码

文本

[3]

# Load graphs of rows/users and columns/movies

mat = scipy.io.loadmat('datasets/synthetic_netflix.mat')

M = mat['M']

Otraining = mat['Otraining']

Otest = mat['Otest']

Wrow = mat['Wrow']

Wcol = mat['Wcol']

n,m = M.shape

print('n,m=',n,m)

Mgt = M # Ground truth

O = Otraining

M = O* Mgt

perc_obs_training = np.sum(Otraining) / (n*m)

print('perc_obs_training=',perc_obs_training)

n,m= 150 200
perc_obs_training= 0.03

代码

文本

[4]

# Viusalize the rating matrix

plt.figure(1)

plt.imshow(Mgt, interpolation='nearest', cmap='jet')

plt.title('Low-rank Matrix M.\nNote: We NEVER observe it\n in real-world applications')

plt.show()

plt.figure(2)

plt.imshow(Otraining*Mgt, interpolation='nearest', cmap='jet')

plt.title('Observed values of M\n for TRAINING.\n Percentage=' + str(perc_obs_training))

plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

代码

文本

[5]

# Content Filtering / Graph Regularization by Dirichlet Energy

#######################################

# Select the set of hyper-parameters

#######################################

# scenario : very low number of ratings, 0.03%, error metric = 161.32

lambdaDir = 1e-1; lambdaDF = 1e3; alpha = 0.02

# Compute Graph Laplacians

Lr = graph_laplacian(Wrow)

Lc = graph_laplacian(Wcol)

I = scipy.sparse.identity(m, dtype=Lr.dtype)

Lr = scipy.sparse.kron( I, Lr )

Lr = scipy.sparse.csr_matrix(Lr)

I = scipy.sparse.identity(n, dtype=Lc.dtype)

Lc = scipy.sparse.kron( Lc, I )

Lc = scipy.sparse.csr_matrix(Lc)

# Pre-processing

L = alpha* Lc + (1.-alpha)* Lr

vecO = np.reshape(O.T,[-1])

vecO = scipy.sparse.diags(vecO, 0, shape=(n*m, n*m) ,dtype=L.dtype)

vecO = scipy.sparse.csr_matrix(vecO)

At = lambdaDir* L + lambdaDF* vecO

vecM = np.reshape(M.T,[-1])

bt = lambdaDF* scipy.sparse.csr_matrix( vecM ).T

bt = np.array(bt.todense()).squeeze()

# Solve by linear system

x,_ = scipy.sparse.linalg.cg(At, bt, x0=bt, tol=1e-9, maxiter=100)

X = np.reshape(x,[m,n]).T

# Reconstruction error

err_test = np.sqrt(np.sum((Otest*(X-Mgt))**2)) / np.sum(Otest) * (n*m)

print('Reconstruction Error: '+ str(round(err_test,5)))

# Plot

plt.figure(2)

plt.imshow(Mgt, interpolation='nearest', cmap='jet')

plt.title('Ground truth low-rank matrix M')

plt.figure(3)

plt.imshow(Otraining*Mgt, interpolation='nearest', cmap='jet')

plt.title('Observed values of M')

plt.figure(4)

plt.imshow(X, interpolation='nearest', cmap='jet')

plt.title('Content Filtering\nReconstruction Error= '+ str(round(err_test,5)))

plt.show()

Reconstruction Error: 161.32328

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

代码

文本

[ ]

代码

文本

Real-world dataset SWEETRS

代码

文本

[6]

# Load graphs of rows/users and columns/products

mat = scipy.io.loadmat('datasets/real_sweetrs_scenario1.mat')

mat = scipy.io.loadmat('datasets/real_sweetrs_scenario2.mat')

# mat = scipy.io.loadmat('datasets/real_sweetrs_scenario3.mat')

M = mat['M']

Otraining = mat['Otraining']

Otest = mat['Otest']

Wrow = mat['Wrow']

Wcol = mat['Wcol']

print('M', M.shape)

print('Otraining', Otraining.shape)

print('Otest', Otest.shape)

print('Wrow', Wrow.shape)

print('Wcol', Wcol.shape)

n,m = M.shape

print('n,m=',n,m)

Mgt = M # Ground truth

O = Otraining

M = O* Mgt

perc_obs_training = np.sum(Otraining)/(n*m)

print('perc_obs_training=',perc_obs_training)

perc_obs_test = np.sum(Otest) / (n*m)

M (664, 77)
Otraining (664, 77)
Otest (664, 77)
Wrow (664, 664)
Wcol (77, 77)
n,m= 664 77
perc_obs_training= 0.1317868878109842

代码

文本

[7]

# Visualize the original rating matrix

plt.figure(1,figsize=(10,10))

plt.imshow(Mgt, interpolation='nearest', cmap='jet', aspect=0.1)

plt.colorbar(shrink=0.65)

plt.title('Original rating matrix\n Percentage observed ratings: ' + str(100*np.sum(Mgt>0)/(n*m))[:5])

# Visualize the observed rating matrix

plt.figure(2, figsize=(10,10))

plt.imshow(Otraining*Mgt, interpolation='nearest', cmap='jet', aspect=0.1)

plt.colorbar(shrink=0.65)

plt.title('Observed rating matrix\n Percentage observed ratings: ' + str(100*perc_obs_training)[:5])

plt.show()

<Figure size 1000x1000 with 2 Axes>

<Figure size 1000x1000 with 2 Axes>

代码

文本

[8]

# Visualize graph of users and graph of products

# Plot adjacency matrix w.r.t. NCut communities

# plot graph of users

W = Wrow

nc = 10; Cncut, _ = compute_ncut(W, np.zeros(Mgt.shape[0]), nc)# compute NCut clustering

[reindexed_W_ncut,reindexed_C_ncut] = reindex_W_with_classes(W,Cncut)

plt.figure(1)

plt.spy(reindexed_W_ncut, precision=0.01, markersize=1)

plt.title('Adjacency matrix of users indexed \naccording to the NCut communities')

plt.show()

A = W.copy()

A.setdiag(0)

A.eliminate_zeros()

G_nx = nx.from_scipy_sparse_array(A)

plt.figure(2,figsize=[30,30])

nx.draw_networkx(G_nx, with_labels=True, node_color=np.array(Cncut), cmap='jet')

# plot graph of products

W = Wcol

nc = 10; Cncut, _ = compute_ncut(W, np.zeros(Mgt.shape[1]), nc)# compute NCut clustering

[reindexed_W_ncut,reindexed_C_ncut] = reindex_W_with_classes(W,Cncut)

plt.figure(3)

plt.spy(reindexed_W_ncut, precision=0.01, markersize=1)

plt.title('Adjacency matrix of products indexed \naccording to the NCut communities')

plt.show()

A = W.copy()

A.setdiag(0)

A.eliminate_zeros()

G_nx = nx.from_scipy_sparse_array(A)

plt.figure(4,figsize=[30,30])

nx.draw_networkx(G_nx, with_labels=True, node_color=np.array(Cncut), cmap='jet')

<Figure size 640x480 with 1 Axes>

<Figure size 3000x3000 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 3000x3000 with 1 Axes>

代码

文本

[9]

# Content Filtering / Graph Regularization by Dirichlet Energy

#######################################

# Select the set of hyper-parameters

#######################################

# scenario 1 : low number of ratings, e.g. 1.3%, error metric = 399.89

lambdaDir = 1e-1; lambdaDF = 1e3; alpha = 0.02

# scenario 2 : intermediate number of ratings, e.g. 13.1%, error metric = 411.24

lambdaDir = 1e-1; lambdaDF = 1e3; alpha = 0.02

# scenario 3 : large number of ratings, e.g. 52.7%, error metric = 748.52

# lambdaDir = 1e-1; lambdaDF = 1e3; alpha = 0.02

# Compute Graph Laplacians

Lr = graph_laplacian(Wrow)

Lc = graph_laplacian(Wcol)

I = scipy.sparse.identity(m, dtype=Lr.dtype)

Lr = scipy.sparse.kron( I, Lr )

Lr = scipy.sparse.csr_matrix(Lr)

I = scipy.sparse.identity(n, dtype=Lc.dtype)

Lc = scipy.sparse.kron( Lc, I )

Lc = scipy.sparse.csr_matrix(Lc)

# Pre-processing

L = alpha* Lc + (1.-alpha)* Lr

vecO = np.reshape(O.T,[-1])

vecO = scipy.sparse.diags(vecO, 0, shape=(n*m, n*m) ,dtype=L.dtype)

vecO = scipy.sparse.csr_matrix(vecO)

At = lambdaDir* L + lambdaDF* vecO

vecM = np.reshape(M.T,[-1])

bt = lambdaDF* scipy.sparse.csr_matrix( vecM ).T

bt = np.array(bt.todense()).squeeze()

# Solve by linear system

x,_ = scipy.sparse.linalg.cg(At, bt, x0=bt, tol=1e-9, maxiter=100)

X = np.reshape(x,[m,n]).T

# Reconstruction error

err_test = np.sqrt(np.sum((Otest*(X-Mgt))**2)) / np.sum(Otest) * (n*m)

print('Reconstruction Error: '+ str(round(err_test,5)))

# Plots

plt.figure(2, figsize=(10,10))

plt.imshow(Mgt, interpolation='nearest', cmap='jet', aspect=0.1)

plt.colorbar(shrink=0.65)

plt.title('Original rating matrix\n Percentage observed ratings: ' + str(100*np.sum(Mgt>0)/(n*m))[:5])

plt.show()

plt.figure(3, figsize=(10,10))

plt.imshow(Otraining*Mgt, interpolation='nearest', cmap='jet', aspect=0.1)

plt.colorbar(shrink=0.65)

plt.title('Observed rating matrix\n Percentage observed ratings: ' + str(100*perc_obs_training)[:5])

plt.show()

plt.figure(4, figsize=(10,10))

plt.imshow(X, interpolation='nearest', cmap='jet', aspect=0.1)

plt.colorbar(shrink=0.65)

plt.title('Content Filtering\nReconstruction Error= '+ str(round(err_test,5)))

plt.show()

Reconstruction Error: 411.24489

<Figure size 1000x1000 with 2 Axes>

<Figure size 1000x1000 with 2 Axes>

<Figure size 1000x1000 with 2 Axes>

代码

文本

[ ]

代码

文本

[ ]

代码

文本

Machine Learning

点个赞吧

本文被以下合集收录

Graph Machine learning

xuxh@dp.tech

更新于 2024-10-08

44 篇0 人关注