{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.metrics import silhouette_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1.获取数据\n",
    "order_product = pd.read_csv(\"./data/instacart/order_products__prior.csv\")\n",
    "products = pd.read_csv(\"./data/instacart/products.csv\")\n",
    "orders = pd.read_csv(\"./data/instacart/orders.csv\")\n",
    "aisles = pd.read_csv(\"./data/instacart/aisles.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.数据基本处理  数据集合并 交叉表\n",
    "\n",
    "# 3.特征工程 PCA\n",
    "\n",
    "# 4.机器学习 聚类\n",
    "\n",
    "# 5.模型评估  轮廓系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.数据基本处理  数据集合并 交叉表\n",
    "data = pd.merge(left=order_product, right=orders, on=['order_id'], how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.merge(left=data, right=products, on=['product_id'], how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.merge(left=data, right=aisles, on=['aisle_id'], how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>order_id</th>\n",
       "      <th>product_id</th>\n",
       "      <th>add_to_cart_order</th>\n",
       "      <th>reordered</th>\n",
       "      <th>user_id</th>\n",
       "      <th>eval_set</th>\n",
       "      <th>order_number</th>\n",
       "      <th>order_dow</th>\n",
       "      <th>order_hour_of_day</th>\n",
       "      <th>days_since_prior_order</th>\n",
       "      <th>product_name</th>\n",
       "      <th>aisle_id</th>\n",
       "      <th>department_id</th>\n",
       "      <th>aisle</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>33120</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>202279</td>\n",
       "      <td>prior</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>9</td>\n",
       "      <td>8.0</td>\n",
       "      <td>Organic Egg Whites</td>\n",
       "      <td>86</td>\n",
       "      <td>16</td>\n",
       "      <td>eggs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>26</td>\n",
       "      <td>33120</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>153404</td>\n",
       "      <td>prior</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>7.0</td>\n",
       "      <td>Organic Egg Whites</td>\n",
       "      <td>86</td>\n",
       "      <td>16</td>\n",
       "      <td>eggs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>120</td>\n",
       "      <td>33120</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>23750</td>\n",
       "      <td>prior</td>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>8</td>\n",
       "      <td>10.0</td>\n",
       "      <td>Organic Egg Whites</td>\n",
       "      <td>86</td>\n",
       "      <td>16</td>\n",
       "      <td>eggs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>327</td>\n",
       "      <td>33120</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>58707</td>\n",
       "      <td>prior</td>\n",
       "      <td>21</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>8.0</td>\n",
       "      <td>Organic Egg Whites</td>\n",
       "      <td>86</td>\n",
       "      <td>16</td>\n",
       "      <td>eggs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>390</td>\n",
       "      <td>33120</td>\n",
       "      <td>28</td>\n",
       "      <td>1</td>\n",
       "      <td>166654</td>\n",
       "      <td>prior</td>\n",
       "      <td>48</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>9.0</td>\n",
       "      <td>Organic Egg Whites</td>\n",
       "      <td>86</td>\n",
       "      <td>16</td>\n",
       "      <td>eggs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   order_id  product_id  add_to_cart_order  reordered  user_id eval_set  \\\n",
       "0         2       33120                  1          1   202279    prior   \n",
       "1        26       33120                  5          0   153404    prior   \n",
       "2       120       33120                 13          0    23750    prior   \n",
       "3       327       33120                  5          1    58707    prior   \n",
       "4       390       33120                 28          1   166654    prior   \n",
       "\n",
       "   order_number  order_dow  order_hour_of_day  days_since_prior_order  \\\n",
       "0             3          5                  9                     8.0   \n",
       "1             2          0                 16                     7.0   \n",
       "2            11          6                  8                    10.0   \n",
       "3            21          6                  9                     8.0   \n",
       "4            48          0                 12                     9.0   \n",
       "\n",
       "         product_name  aisle_id  department_id aisle  \n",
       "0  Organic Egg Whites        86             16  eggs  \n",
       "1  Organic Egg Whites        86             16  eggs  \n",
       "2  Organic Egg Whites        86             16  eggs  \n",
       "3  Organic Egg Whites        86             16  eggs  \n",
       "4  Organic Egg Whites        86             16  eggs  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 建立交叉表\n",
    "data = pd.crosstab(data['user_id'], data['aisle'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.特征工程 PCA\n",
    "transfer = PCA(n_components=0.9)\n",
    "data = transfer.fit_transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n",
       "    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',\n",
       "    random_state=None, tol=0.0001, verbose=0)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4.机器学习 聚类\n",
    "x = data[:1000, :]\n",
    "# 4.1 创建模型\n",
    "estimator = KMeans(n_clusters=8)\n",
    "# 4.2 训练模型\n",
    "estimator.fit(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3313898637995612"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5.模型评估  轮廓系数\n",
    "y_predict = estimator.predict(x)\n",
    "silhouette_score(x, y_predict) # 平均轮廓系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
