{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "NNamP65y8eGf" }, "outputs": [], "source": [ "from sklearn import datasets\n", "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "from sklearn.decomposition import PCA, KernelPCA\n", "from sklearn.datasets import make_circles\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.decomposition import NMF\n", "from sklearn.decomposition import TruncatedSVD\n", "from scipy.sparse import csr_matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fvJfKhFq8hQc", "outputId": "acbc4c59-acbd-4ff4-bacb-e54b55e0312f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original number of features: 64\n", "Reduced number of features: 40\n" ] } ], "source": [ "# Load the data\n", "digits = datasets.load_digits()\n", "# Feature matrix standardization\n", "features = StandardScaler().fit_transform(digits.data)\n", "# Perform PCA While retaining 80% of variance\n", "pca = PCA(n_components=0.95, whiten=True)\n", "# perform PCA\n", "pcafeatures = pca.fit_transform(features)\n", "# Display results\n", "print(\"Original number of features:\", features.shape[1])\n", "print(\"Reduced number of features:\", pcafeatures.shape[1])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jyU800Lf8it4", "outputId": "0d4c73bf-7d08-48e6-a44f-a5647a2e0c11" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original number of features: 2\n", "Reduced number of features: 1\n" ] } ], "source": [ "# Creation of the linearly inseparable data\n", "features, _ = make_circles(n_samples=2000, random_state=1, noise=0.1, factor=0.1)\n", "# kernal PCA with radius basis function (RBF) kernel application\n", "k_pca = KernelPCA(kernel=\"rbf\", gamma=16, n_components=1)\n", "k_pcaf = 
k_pca.fit_transform(features)\n", "print(\"Original number of features:\", features.shape[1])\n", "print(\"Reduced number of features:\", k_pcaf.shape[1])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IfCo5TA28kn6", "outputId": "312956a9-9fb5-4296-d766-a3e642649da1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "number of features(original): 4\n", "number of features that was reduced: 1\n" ] } ], "source": [ "#flower dataset loading:\n", "iris = datasets.load_iris()\n", "features = iris.data\n", "target = iris.target\n", "# Creation of LDA. Use of LDA for features transformation\n", "lda = LinearDiscriminantAnalysis(n_components=1)\n", "features_lda = lda.fit(features, target).transform(features)\n", "# Print the number of features\n", "print(\"number of features(original):\", features.shape[1])\n", "print(\"number of features that was reduced:\", features_lda.shape[1])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yjQBlMtM8mQu", "outputId": "800279fb-f44b-43e8-9210-a35b8e190fc7" }, "outputs": [ { "data": { "text/plain": [ "array([0.9912126])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda.explained_variance_ratio_" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tHOWTxn18nf7", "outputId": "ae3c857a-0ca8-4508-affc-b5ea4dff6788" }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Load Iris flower dataset:\n",
"iris123 = datasets.load_iris()\n",
"features = iris123.data\n",
"target = iris123.target\n",
"# Fit LDA keeping every component. fit() returns the estimator itself, so its\n",
"# result is deliberately not bound to a features_* name (that would replace the\n",
"# transformed array from the earlier cell with an estimator object)\n",
"lda_r = LinearDiscriminantAnalysis(n_components=None)\n",
"lda_r.fit(features, target)\n",
"# array of explained variance ratios\n", 
"lda_var_r = lda_r.explained_variance_ratio_\n", "# function ceration\n", "def select_n_c(v_ratio, g_var: float) -> int:\n", " # initial variance explained setting\n", " total_v = 0.0\n", " # number of features initialisation\n", " n_components = 0\n", " # If we consider explained variance of each feature:\n", " for explained_v in v_ratio:\n", " # explained variance addition to the total\n", " total_v += explained_v\n", " # add one to number of components\n", " n_components += 1\n", " # we attain our goal level of explained variance\n", " if total_v >= g_var:\n", " # end the loop\n", " break\n", " # return the number of components\n", " return n_components\n", "\n", "# run the function\n", "select_n_c(lda_var_r, 0.95)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "12zwY1Du8o6i", "outputId": "e9178fdf-2195-41cc-f4c3-a1e52c030df5" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.7/dist-packages/sklearn/decomposition/_nmf.py:294: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).\n", " FutureWarning,\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Original number of features: 64\n", "Reduced number of features: 12\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.7/dist-packages/sklearn/decomposition/_nmf.py:1641: ConvergenceWarning: Maximum number of iterations 200 reached. 
Increase it to improve convergence.\n", " ConvergenceWarning,\n" ] } ], "source": [
"# data loading\n",
"digit = datasets.load_digits()\n",
"# feature matrix loading\n",
"feature_m = digit.data\n",
"# Creation, fit and application of NMF.\n",
"# init is pinned because its default differs across scikit-learn releases;\n",
"# max_iter is raised because the default 200 iterations ended in a\n",
"# ConvergenceWarning (raise it further if the warning still appears)\n",
"n_mf = NMF(n_components=12, init=\"nndsvda\", max_iter=1000, random_state=1)\n",
"features_nmf = n_mf.fit_transform(feature_m)\n",
"# Show results\n",
"print(\"Original number of features:\", feature_m.shape[1])\n",
"print(\"Reduced number of features:\", features_nmf.shape[1])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wrEYF9Ql8qtU", "outputId": "c28d28be-4f0b-4bd7-bb56-fde6ead38a45" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original number of features: 64\n", "Reduced number of features: 12\n" ] } ], "source": [
"# data loading\n",
"digit123 = datasets.load_digits()\n",
"# feature matrix Standardization\n",
"features_m = StandardScaler().fit_transform(digit123.data)\n",
"# sparse matrix creation\n",
"f_sparse = csr_matrix(features_m)\n",
"# TSVD creation; seeded because the default randomized solver is stochastic\n",
"tsvd = TruncatedSVD(n_components=12, random_state=1)\n",
"# sparse matrix TSVD\n",
"features_sp_tsvd = tsvd.fit(f_sparse).transform(f_sparse)\n",
"# results\n",
"print(\"Original number of features:\", f_sparse.shape[1])\n",
"print(\"Reduced number of features:\", features_sp_tsvd.shape[1])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xRQ_nUf_8sZA", "outputId": "19b8d99c-b330-406d-e728-407c18d82f20" }, "outputs": [ { "data": { "text/plain": [ "0.3003938539283667" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sum of first three components' explained variance ratios\n", "tsvd.explained_variance_ratio_[0:3].sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zbExVkXp8vpi" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": 
"DimentionalityReductionUsingFeatureExtraction_PythonCodeTutorial.ipynb", "provenance": [] }, "interpreter": { "hash": "f89a88aed07bbcd763ac68893150ace71e487877d8c6527a76855322f20001c6" }, "kernelspec": { "display_name": "Python 3.9.12 64-bit", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 0 }