{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "uq9k8YYUKjnp" }, "outputs": [], "source": [ "import os\n", "import urllib.request\n", "import zipfile\n", "import json\n", "import pandas as pd\n", "import time\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader, TensorDataset\n", "from sklearn.model_selection import train_test_split\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "L5h3Tsa0LIoo" }, "outputs": [], "source": [ "def unzip_archive(filepath, dir_path):\n", " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n", " zip_ref.extractall(dir_path)\n", "\n", "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import shutil\n", "\n", "def make_dir(directory):\n", " if os.path.exists(directory):\n", " shutil.rmtree(directory)\n", " os.makedirs(directory)\n", " else:\n", " os.makedirs(directory)\n", " \n", "directory = os.getcwd() + '/data/raw/data'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " 'name',\n", " 'pid',\n", " 'num_followers',\n", " 'pos',\n", " 'artist_name',\n", " 'track_name',\n", " 'album_name'\n", "]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qyCujIu8cDGg", "outputId": "0964ace3-2916-49e3-eebf-2e08e61d95d9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mpd.slice.188000-188999.json\t100/1000\t10.0%" ] } ], "source": [ "\n", "directory = os.getcwd() + '/data/raw/playlists/data'\n", "df = pd.DataFrame()\n", "index = 0\n", "# Loop through all files in the directory\n", "for filename in os.listdir(directory):\n", " # Check if the item is a file (not a subdirectory)\n", " if os.path.isfile(os.path.join(directory, filename)):\n", " if filename.find('.json') != -1 :\n", " index += 1\n", "\n", " # Print the filename or perform operations on the file\n", " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n", "\n", " # If you need the full file path, you can use:\n", " full_path = os.path.join(directory, filename)\n", "\n", " with open(full_path, 'r') as file:\n", " json_data = json.load(file)\n", "\n", " temp = pd.DataFrame(json_data['playlists'])\n", " expanded_df = temp.explode('tracks').reset_index(drop=True)\n", "\n", " # Normalize the JSON data\n", " json_normalized = pd.json_normalize(expanded_df['tracks'])\n", "\n", " # Concatenate the original DataFrame with the normalized JSON data\n", " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n", " \n", " result = result[cols]\n", "\n", " df = pd.concat([df, result], axis=0, ignore_index=True)\n", " \n", " if index % 50 == 0:\n", " df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n", " del df\n", " df = pd.DataFrame()\n", " if index % 100 == 0:\n", " break" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pyarrow.parquet as pq\n", "\n", "def read_parquet_folder(folder_path):\n", " dataframes = []\n", " for file in os.listdir(folder_path):\n", " if file.endswith('.parquet'):\n", " file_path = os.path.join(folder_path, file)\n", " df = pd.read_parquet(file_path)\n", " dataframes.append(df)\n", " \n", " return pd.concat(dataframes, ignore_index=True)\n", "\n", "folder_path = os.getcwd() + '/data/raw/data'\n", "df = read_parquet_folder(folder_path)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "directory = os.getcwd() + '/data/raw/mappings'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def create_ids(df, col, name):\n", " # Create a dictionary mapping unique values to IDs\n", " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n", "\n", " # Create a new column with the IDs\n", " df[f'{name}_id'] = df[col].map(value_to_id)\n", " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n", " # df = df.drop(col, axis=1)\n", " return df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df = create_ids(df, 'artist_name', 'artist')\n", "df = create_ids(df, 'pid', 'playlist')\n", "df = create_ids(df, 'track_name', 'song')\n", "df = create_ids(df, 'album_name', 'album')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df['artist_count'] = df.groupby(['playlist_id','artist_id'])['song_id'].transform('nunique')\n", "df['album_count'] = df.groupby(['playlist_id','artist_id'])['album_id'].transform('nunique')\n", "df['song_count'] = df.groupby(['playlist_id','artist_id'])['song_id'].transform('count')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df['playlist_songs'] = df.groupby(['playlist_id'])['pos'].transform('max')\n", "df['playlist_songs'] += 1" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df['artist_percent'] = df['artist_count'] / df['playlist_songs']\n", "df['song_percent'] = df['song_count'] / df['playlist_songs']\n", "df['album_percent'] = df['album_count'] / df['playlist_songs']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | name | \n", "pid | \n", "num_followers | \n", "pos | \n", "artist_name | \n", "track_name | \n", "album_name | \n", "artist_id | \n", "playlist_id | \n", "song_id | \n", "album_id | \n", "artist_count | \n", "album_count | \n", "song_count | \n", "playlist_songs | \n", "artist_percent | \n", "song_percent | \n", "album_percent | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 212 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "0 | \n", "R. Kelly | \n", "Ignition - Remix | \n", "Chocolate Factory | \n", "108 | \n", "5 | \n", "203 | \n", "152 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 213 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "1 | \n", "Backstreet Boys | \n", "I Want It That Way | \n", "Original Album Classics | \n", "109 | \n", "5 | \n", "204 | \n", "153 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 214 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "2 | \n", "*NSYNC | \n", "Bye Bye Bye | \n", "No Strings Attached | \n", "110 | \n", "5 | \n", "205 | \n", "154 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 215 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "3 | \n", "Fountains Of Wayne | \n", "Stacy's Mom | \n", "Welcome Interstate Managers | \n", "111 | \n", "5 | \n", "206 | \n", "155 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 216 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "4 | \n", "Bowling For Soup | \n", "1985 | \n", "A Hangover You Don't Deserve | \n", "112 | \n", "5 | \n", "207 | \n", "156 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 400 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "188 | \n", "JoJo | \n", "Too Little, Too Late - Radio Version | \n", "Too Little, Too Late | \n", "199 | \n", "5 | \n", "390 | \n", "293 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 401 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "189 | \n", "Spice Girls | \n", "Wannabe - Radio Edit | \n", "Spice | \n", "200 | \n", "5 | \n", "391 | \n", "294 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 402 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "190 | \n", "MiMS | \n", "This Is Why I'm Hot | \n", "Music Is My Savior | \n", "201 | \n", "5 | \n", "392 | \n", "295 | \n", "1 | \n", "1 | \n", "1 | \n", "193 | \n", "0.005181 | \n", "0.005181 | \n", "0.005181 | \n", "
| 403 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "191 | \n", "Rihanna | \n", "Disturbia | \n", "Good Girl Gone Bad | \n", "115 | \n", "5 | \n", "393 | \n", "296 | \n", "3 | \n", "3 | \n", "3 | \n", "193 | \n", "0.015544 | \n", "0.015544 | \n", "0.015544 | \n", "
| 404 | \n", "throwbacks | \n", "143005 | \n", "2 | \n", "192 | \n", "DEV | \n", "Bass Down Low | \n", "The Night The Sun Came Up | \n", "179 | \n", "5 | \n", "394 | \n", "264 | \n", "2 | \n", "1 | \n", "2 | \n", "193 | \n", "0.010363 | \n", "0.010363 | \n", "0.005181 | \n", "
193 rows × 18 columns
\n", "| \n", " | playlist_id | \n", "artist_id | \n", "artist_percent | \n", "
|---|---|---|---|
| 0 | \n", "0 | \n", "0 | \n", "0.571429 | \n", "
| 1 | \n", "0 | \n", "0 | \n", "0.571429 | \n", "
| 2 | \n", "0 | \n", "0 | \n", "0.571429 | \n", "
| 3 | \n", "0 | \n", "0 | \n", "0.571429 | \n", "
| 4 | \n", "0 | \n", "0 | \n", "0.571429 | \n", "