{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install ЧегоНеХватает" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.7/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n", "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", "You can install the OpenMP library by the following command: ``brew install libomp``.\n", " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import lightgbm # сожрет все сырым и построит регрессионную модель, которая покажет важные фичи\n", " # чтобы дальше делать лабу только на них\n", " # если не можешь установить, то важные фичи в ячейке [8]\n", "from sklearn.model_selection import train_test_split, GridSearchCV\n", "from sklearn.neighbors import KNeighborsRegressor" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bulk_idspalendate1start_squarevaluepricemean_sqmean_flplan_splan_m...До промки(км)До парка(км)До парка пешком(км)Станций метро от кольцаПлощадь двораКурсCтавка по ипотекеВклады до 1 годаВклады от 1 года до 3 летВклады свыше 3 лет
id
0FF3814A9-9F7D-E711-8530-00505688958B12017-10-0113109.91124.5135633.91304338.008147198...0.280.580.938764057.69805210.045.745.946.03
1FF3814A9-9F7D-E711-8530-00505688958B22017-10-0119898.41412.3128492.41982558.019139120...0.280.580.938764057.69805210.045.745.946.03
2FF3814A9-9F7D-E711-8530-00505688958B02017-10-012100.0345.0158237.63440922.58107815...0.280.580.938764057.69805210.045.745.946.03
3FF3814A9-9F7D-E711-8530-00505688958B32017-10-019614.9769.4123250.00000078.818500...0.280.580.938764057.69805210.045.745.946.03
4FF3814A9-9F7D-E711-8530-00505688958B12017-11-0111947.2526.7139420.63492137.939140175...0.280.580.938764058.9265529.875.177.076.20
\n", "

5 rows × 55 columns

\n", "
" ], "text/plain": [ " bulk_id spalen date1 start_square \\\n", "id \n", "0 FF3814A9-9F7D-E711-8530-00505688958B 1 2017-10-01 13109.9 \n", "1 FF3814A9-9F7D-E711-8530-00505688958B 2 2017-10-01 19898.4 \n", "2 FF3814A9-9F7D-E711-8530-00505688958B 0 2017-10-01 2100.0 \n", "3 FF3814A9-9F7D-E711-8530-00505688958B 3 2017-10-01 9614.9 \n", "4 FF3814A9-9F7D-E711-8530-00505688958B 1 2017-11-01 11947.2 \n", "\n", " value price mean_sq mean_fl plan_s plan_m \\\n", "id \n", "0 1124.5 135633.913043 38.00 8 147 198 \n", "1 1412.3 128492.419825 58.01 9 139 120 \n", "2 345.0 158237.634409 22.58 10 78 15 \n", "3 769.4 123250.000000 78.81 8 50 0 \n", "4 526.7 139420.634921 37.93 9 140 175 \n", "\n", " ... До промки(км) До парка(км) До парка пешком(км) \\\n", "id ... \n", "0 ... 0.28 0.58 0.93 \n", "1 ... 0.28 0.58 0.93 \n", "2 ... 0.28 0.58 0.93 \n", "3 ... 0.28 0.58 0.93 \n", "4 ... 0.28 0.58 0.93 \n", "\n", " Станций метро от кольца Площадь двора Курс Cтавка по ипотеке \\\n", "id \n", "0 8 7640 57.698052 10.04 \n", "1 8 7640 57.698052 10.04 \n", "2 8 7640 57.698052 10.04 \n", "3 8 7640 57.698052 10.04 \n", "4 8 7640 58.926552 9.87 \n", "\n", " Вклады до 1 года Вклады от 1 года до 3 лет Вклады свыше 3 лет \n", "id \n", "0 5.74 5.94 6.03 \n", "1 5.74 5.94 6.03 \n", "2 5.74 5.94 6.03 \n", "3 5.74 5.94 6.03 \n", "4 5.17 7.07 6.20 \n", "\n", "[5 rows x 55 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('PIK_Property.csv', encoding='cp1251', index_col=0)\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "целевая переменная value" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "тут нет пропущенных значений, сделаем сами:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bulk_idspalendate1start_squarevaluepricemean_sqmean_flplan_splan_m...До промки(км)До парка(км)До парка пешком(км)Станций метро от кольцаПлощадь двораКурсCтавка по ипотекеВклады до 1 годаВклады от 1 года до 3 летВклады свыше 3 лет
id
0NaN12017-10-0113109.91124.5135633.913043NaN8147198...0.280.580.938764057.69805210.045.745.946.03
1FF3814A9-9F7D-E711-8530-00505688958B22017-10-0119898.41412.3128492.41982558.019139120...0.280.580.938764057.69805210.045.745.946.03
2FF3814A9-9F7D-E711-8530-00505688958B02017-10-012100.0345.0158237.63440922.58107815...0.280.580.938764057.69805210.045.745.946.03
3FF3814A9-9F7D-E711-8530-00505688958B32017-10-019614.9769.4123250.00000078.818500...0.280.580.938764057.69805210.045.745.946.03
4FF3814A9-9F7D-E711-8530-00505688958B12017-11-0111947.2526.7139420.63492137.939140175...0.280.580.938764058.9265529.875.177.076.20
\n", "

5 rows × 55 columns

\n", "
" ], "text/plain": [ " bulk_id spalen date1 start_square \\\n", "id \n", "0 NaN 1 2017-10-01 13109.9 \n", "1 FF3814A9-9F7D-E711-8530-00505688958B 2 2017-10-01 19898.4 \n", "2 FF3814A9-9F7D-E711-8530-00505688958B 0 2017-10-01 2100.0 \n", "3 FF3814A9-9F7D-E711-8530-00505688958B 3 2017-10-01 9614.9 \n", "4 FF3814A9-9F7D-E711-8530-00505688958B 1 2017-11-01 11947.2 \n", "\n", " value price mean_sq mean_fl plan_s plan_m \\\n", "id \n", "0 1124.5 135633.913043 NaN 8 147 198 \n", "1 1412.3 128492.419825 58.01 9 139 120 \n", "2 345.0 158237.634409 22.58 10 78 15 \n", "3 769.4 123250.000000 78.81 8 50 0 \n", "4 526.7 139420.634921 37.93 9 140 175 \n", "\n", " ... До промки(км) До парка(км) До парка пешком(км) \\\n", "id ... \n", "0 ... 0.28 0.58 0.93 \n", "1 ... 0.28 0.58 0.93 \n", "2 ... 0.28 0.58 0.93 \n", "3 ... 0.28 0.58 0.93 \n", "4 ... 0.28 0.58 0.93 \n", "\n", " Станций метро от кольца Площадь двора Курс Cтавка по ипотеке \\\n", "id \n", "0 8 7640 57.698052 10.04 \n", "1 8 7640 57.698052 10.04 \n", "2 8 7640 57.698052 10.04 \n", "3 8 7640 57.698052 10.04 \n", "4 8 7640 58.926552 9.87 \n", "\n", " Вклады до 1 года Вклады от 1 года до 3 лет Вклады свыше 3 лет \n", "id \n", "0 5.74 5.94 6.03 \n", "1 5.74 5.94 6.03 \n", "2 5.74 5.94 6.03 \n", "3 5.74 5.94 6.03 \n", "4 5.17 7.07 6.20 \n", "\n", "[5 rows x 55 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[0::5, ['mean_sq','bulk_id']] = np.nan # каждая пятая строка содержит пропущенные\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "все колонки, которые не являются числами, указываем категориальными (чтобы lgbm их съел):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [], "source": [ "for column in df.select_dtypes(include = ['object']).columns.tolist():\n", " df[column] = df[column].astype('category')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], 
"source": [ "lgbm_regressor = lightgbm.LGBMRegressor().fit(df.loc[:, df.columns != 'value'], df['value'])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [], "source": [ "list_of_importances = list(zip(df.loc[:, df.columns != 'value'].columns.tolist(), \n", " lgbm_regressor.feature_importances_))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[('bulk_id', 366),\n", " ('start_square', 339),\n", " ('date1', 285),\n", " ('plan_l', 173),\n", " ('mean_sq', 168),\n", " ('plan_m', 155),\n", " ('vid_2', 131),\n", " ('price', 129),\n", " ('plan_s', 116),\n", " ('vid_1', 115),\n", " ('Площадь двора', 98),\n", " ('spalen', 84),\n", " ('До парка(км)', 64),\n", " ('До большой дороги на машине(км)', 58),\n", " ('vid_0', 53),\n", " ('mean_fl', 51),\n", " ('До метро пешком(км)', 46),\n", " ('month_cnt', 45),\n", " ('Площадь зеленой зоны в радиусе 500 м', 44),\n", " ('month', 43),\n", " ('Вклады свыше 3 лет', 40),\n", " ('Количество помещений', 39),\n", " ('Вклады от 1 года до 3 лет', 34),\n", " ('До удобной авторазвязки на машине(км)', 32),\n", " ('До парка пешком(км)', 28),\n", " ('До промки(км)', 25),\n", " ('Площадь пром. 
зоны в радиусе 500 м', 23),\n", " ('Cтавка по ипотеке', 22),\n", " ('Курс', 21),\n", " ('Детский сад', 19),\n", " ('Школа', 19),\n", " ('Вклады до 1 года', 18),\n", " ('Площадь земельного участка', 16),\n", " ('Машиномест', 15),\n", " ('До Садового(км)', 14),\n", " ('Станций метро от кольца', 14),\n", " ('До Кремля', 12),\n", " ('До ТТК(км)', 11),\n", " ('Класс объекта', 9),\n", " ('Двор без машин', 9),\n", " ('ФОК', 5),\n", " ('Вентлияция', 5),\n", " ('Поликлиника', 3),\n", " ('Видеонаблюдение', 3),\n", " ('Кладовые', 1),\n", " ('Огорожена территория', 0),\n", " ('Входные группы', 0),\n", " ('Спортивная площадка', 0),\n", " ('Автомойка', 0),\n", " ('Колясочные', 0),\n", " ('Кондиционирование', 0),\n", " ('Лифт', 0),\n", " ('Система мусоротведения', 0),\n", " ('Подземная парковка', 0)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ],
"source": [
 "# Show all (feature, importance) pairs, most important first.\n",
 "sorted(list_of_importances, key=lambda pair: pair[1], reverse=True)"
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bulk_id',\n", " 'start_square',\n", " 'date1',\n", " 'plan_l',\n", " 'mean_sq',\n", " 'plan_m',\n", " 'vid_2',\n", " 'price',\n", " 'plan_s',\n", " 'vid_1',\n", " 'Площадь двора',\n", " 'spalen',\n", " 'До парка(км)',\n", " 'До большой дороги на машине(км)',\n", " 'vid_0',\n", " 'mean_fl',\n", " 'До метро пешком(км)',\n", " 'month_cnt',\n", " 'Площадь зеленой зоны в радиусе 500 м']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ],
"source": [
 "# How many of the highest-ranked features to keep for the rest of the notebook.\n",
 "N_TOP_FEATURES = 19\n",
 "\n",
 "ranked_importances = sorted(list_of_importances, key=lambda pair: pair[1], reverse=True)\n",
 "important_features = [name for name, _importance in ranked_importances[:N_TOP_FEATURES]]\n",
 "important_features"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "оставляем в датасете только важные фичи:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [],
"source": [
 "# Build the column list without mutating `important_features`, so re-running this cell\n",
 "# cannot append 'value' a second time and produce a duplicated target column.\n",
 "# .copy() makes df an independent frame, so later column assignments do not write into a slice.\n",
 "df = df[important_features + ['value']].copy()"
] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### Лаба 3: пропуски, кодирование, масштабирование" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "заполнение пропусков:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "for column in df.select_dtypes(include= ['int64', 'float64']).columns.tolist():\n", " df[column] = df[column].fillna(df[column].mean())\n", "for column in df.select_dtypes(include= ['category']).columns.tolist():\n", " df[column] = df[column].fillna(df[column].describe(include= ['category'])['top'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "кодирование категориальных:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "for column in df.select_dtypes(include= ['category']).columns.tolist():\n", " df = df.merge(df.groupby(column)['value'].mean().rename(str(column+'_value_mean')).to_frame(),\n", " how= 'left',\n", " left_on= column,\n", " right_index= True)\n", " df.drop(column, axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "масштабирование признаков:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df.loc[:, df.columns != 'value'] = df.loc[:, df.columns != 'value'].apply(lambda x: x/x.max(), \n", " axis=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Лаба 4: train-test, кросс-валидация, подбор параметров" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "трейн-тест:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'value'], \n", " df['value'], \n", " test_size= 0.1, \n", " random_state= 42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "подбор лучшего параметра по кросс-валидации:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"GridSearchCV(cv=3, error_score='raise-deprecating',\n", " estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", " weights='uniform'),\n", " fit_params=None, iid='warn', n_jobs=None,\n", " param_grid={'n_neighbors': [5, 10, 20, 50]},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", " scoring='neg_mean_absolute_error', verbose=0)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "random_search = GridSearchCV(estimator= KNeighborsRegressor(),\n", " param_grid= {'n_neighbors': [5,10,20,50]},\n", " scoring= 'neg_mean_absolute_error', \n", " cv= 3)\n", "\n", "random_search.fit(df.loc[:, df.columns != 'value'], df['value'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'n_neighbors': 50}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "random_search.best_params_" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "202.2916783427088" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "-random_search.best_score_ # минус потому что метрика 'negative_mean_absolute_error'" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }