Anda di halaman 1dari 13

{

"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Let's perform analysis!\n",
"\n",
"Hello, I'm Dixhom. Here I talk about how to perform feature engineering, delete unwanted variables, build a model and make submission data! So this is a tutorial for data science beginners. So let's get the ball rolling.\n",
"\n",
"(This is for a kaggle competition 'Kobe Bryant Shot Selection' (https://www.kaggle.com/c/kobe-bryant-shot-selection))"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.cross_validation import KFold"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# import data\n",
"# Raw string: in a normal literal \"\\a\" (from \"\\ajish\") is the BEL escape, which corrupted the path.\n",
"# TODO(review): absolute local path; point this at wherever the competition CSV lives.\n",
"filename = r\"C:\\Users\\ajish\\PythonCoding\\KobeBryant.csv\"\n",
"raw = pd.read_csv(filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature engineering\n",
"Now let's start feature engineering. There are many features which should b
e modified or deleted for brevity. Let's take a look into variables.\n",
"\n",
"First, let's take a look at all the variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"raw.head()"
]
},
{
"cell_type": "markdown",

"metadata": {},
"source": [
"## Dropping nans\n",
"We are gonna make a variable without `nan` for our exploratory analysis. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# keep only the rows whose shot_made_flag is present\n",
"has_label = raw['shot_made_flag'].notnull()\n",
"nona = raw[has_label]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## loc_x, loc_y, lat and lon\n",
"What do these mean? From their names, these sound like **location_x, locati
on_y, latitude and longitude**. Let's confirm this assumption. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"alpha = 0.02\n",
"fig, (ax_loc, ax_geo) = plt.subplots(1, 2, figsize=(10,10))\n",
"\n",
"# left panel: loc_x and loc_y\n",
"ax_loc.scatter(nona.loc_x, nona.loc_y, color='blue', alpha=alpha)\n",
"ax_loc.set_title('loc_x and loc_y')\n",
"\n",
"# right panel: lat and lon\n",
"ax_geo.scatter(nona.lon, nona.lat, color='green', alpha=alpha)\n",
"ax_geo.set_title('lat and lon')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"These plots are shaped like basketball courts. So loc_x, loc_y, lat and lon seem to mean the position from which the ball was tossed. However, since the region under the net is half-circle-shaped, it would be more suitable to transform the variables into **polar coordinates**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# radial distance of each shot from the origin of (loc_x, loc_y)\n",
"raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)\n",
"\n",
"# Start every row at pi/2 (the value used where loc_x == 0); this also makes the column float.\n",
"loc_x_zero = raw['loc_x'] == 0\n",
"raw['angle'] = np.pi / 2\n",
"# .loc writes into `raw` itself; chained raw['angle'][mask] = ... may only modify a temporary copy.\n",
"raw.loc[~loc_x_zero, 'angle'] = np.arctan(raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since rows where `loc_x` is zero would cause a division by zero, we simply set `np.pi / 2` for the corresponding rows.\n",
"\n",
"## minutes_remaining and seconds_remaining\n",
"`minutes_remaining` and `seconds_remaining` seem to be a pair, so let's combine them together."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# fold the minutes/seconds pair into a single count of seconds\n",
"minutes_as_seconds = raw['minutes_remaining'] * 60\n",
"raw['remaining_time'] = minutes_as_seconds + raw['seconds_remaining']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## action_type, combined_shot_type, shot_type\n",
"These represent how the player shot the ball."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# list the distinct values of each shot-description column\n",
"for column in ('action_type', 'combined_shot_type', 'shot_type'):\n",
"    print(nona[column].unique())"
]
},

{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Season\n",
"`Season` looks like it consists of two parts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"nona['season'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`Season` seems to be composed of two parts: season year and season ID. Here
we only need season ID. Let's modify the data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def season_id(label):\n",
"    \"\"\"Return the part of a season label after the first '-' as an int.\"\"\"\n",
"    parts = label.split('-')\n",
"    return int(parts[1])\n",
"\n",
"raw['season'] = raw['season'].map(season_id)\n",
"raw['season'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## team_id and team_name\n",
"Each of these contains a single value, so they seem useless. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# list the distinct values of each team column\n",
"for column in ('team_id', 'team_name'):\n",
"    print(nona[column].unique())"
]
},
{

"cell_type": "markdown",
"metadata": {},
"source": [
"## opponent , matchup\n",
"These are basically the same information. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# show the two columns side by side\n",
"nona[['matchup', 'opponent']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Only opponent is needed."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shot distance\n",
"We already defined this."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# compare the computed dist with the provided shot_distance\n",
"plt.figure(figsize=(5,5))\n",
"plt.scatter(raw['dist'], raw['shot_distance'], color='blue')\n",
"plt.title('dist and shot_distance')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`shot_distance` is proportional to `dist`, so it won't be necessary.\n",
"\n",
"## shot_zone_area, shot_zone_basic, shot_zone_range\n",
"These sound like some regions on the court, so let's visualize them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import matplotlib.cm as cm\n",
"plt.figure(figsize=(20,10))\n",
"\n",
"def scatter_plot_by_category(feat):\n",
"    \"\"\"Scatter loc_x/loc_y of `nona` on the current axes, one rainbow colour per value of `feat`.\"\"\"\n",
"    groups = nona.groupby(feat)\n",
"    palette = cm.rainbow(np.linspace(0, 1, len(groups)))\n",
"    for (_, shots), colour in zip(groups, palette):\n",
"        plt.scatter(shots.loc_x, shots.loc_y, color=colour, alpha=0.1)\n",
"\n",
"# shot_zone_area\n",
"plt.subplot(1, 3, 1)\n",
"scatter_plot_by_category('shot_zone_area')\n",
"plt.title('shot_zone_area')\n",
"\n",
"# shot_zone_basic\n",
"plt.subplot(1, 3, 2)\n",
"scatter_plot_by_category('shot_zone_basic')\n",
"plt.title('shot_zone_basic')\n",
"\n",
"# shot_zone_range\n",
"plt.subplot(1, 3, 3)\n",
"scatter_plot_by_category('shot_zone_range')\n",
"plt.title('shot_zone_range')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we thought, these represent regions on the court. However, these regions
can be separated by `dist` and `angle`. So we don't need these."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## dropping unneeded variables\n",
"Let's drop unnecessary variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# columns judged redundant or uninformative in the sections above\n",
"drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',\n",
"         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',\n",
"         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']\n",
"# one call with an explicit axis keyword (a positional axis is rejected by newer pandas)\n",
"raw = raw.drop(drops, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## make dummy variables\n",
"We are going to use randomForest classifier for building our models but thi
s doesn't accept string variables like 'action_type'. So we are going to make du
mmy variables for those."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# turn categorical variables into dummy variables\n",
"categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']\n",
"for var in categorical_vars:\n",
"    # append one indicator column per distinct value, then remove the original column;\n",
"    # axis is passed by keyword because newer pandas rejects it positionally\n",
"    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)\n",
"    raw = raw.drop(var, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## separating data for training and submission\n",
"Now let's separate data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# rows with a known shot_made_flag are for training; rows without one are to be predicted\n",
"df = raw[pd.notnull(raw['shot_made_flag'])]\n",
"submission = raw[pd.isnull(raw['shot_made_flag'])]\n",
"# axis passed by keyword: newer pandas rejects a positional axis\n",
"submission = submission.drop('shot_made_flag', axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are separating `df` further into explanatory and response variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# separate df into explanatory and response variables\n",
"# (axis passed by keyword: newer pandas rejects a positional axis)\n",
"train = df.drop('shot_made_flag', axis=1)\n",
"train_y = df['shot_made_flag']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## logloss\n",
"Submissions are evaluated on the log loss. We are going to use it for evalu
ating our model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import scipy as sp\n",
"def logloss(act, pred):\n",
"    \"\"\"Return the mean binary log loss of `pred` against `act`.\n",
"\n",
"    act  : sequence of 0/1 outcomes.\n",
"    pred : sequence of predicted probabilities of outcome 1, same length as `act`.\n",
"    Predictions are clipped to [1e-15, 1 - 1e-15] so the logarithms stay finite.\n",
"    \"\"\"\n",
"    epsilon = 1e-15\n",
"    # plain float arrays: works for lists, numpy arrays and pandas Series alike\n",
"    act = np.asarray(act, dtype=float)\n",
"    pred = np.asarray(pred, dtype=float)\n",
"    pred = np.maximum(epsilon, pred)\n",
"    pred = np.minimum(1-epsilon, pred)\n",
"    ll = np.sum(act*np.log(pred) + (1-act)*np.log(1-pred))\n",
"    return ll * -1.0/len(act)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building a model\n",
"Now it's time to build a model. We use randomForest classifier and k-fold cross validation for testing our model.\n",
"We are going to...\n",
"\n",
"1. pick an `n` from `range_n` for the number of estimators in RandomForestClassifier.\n",
"2. divide the training data into 10 pieces\n",
"3. pick 9 of them for building a model and use the remaining 1 for testing the model\n",
"4. repeat the same process for the other 9 pieces.\n",
"5. calculate the score for each and take an average of them\n",
"6. pick the next `n` and do the process again\n",
"7. find the `n` which gave the best score among `range_n`\n",
"8. repeat the same process with the tree depth parameter.\n",
"\n",
"You can change the arguments of `np.logspace` to search for the optimum value over a broader range."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import confusion_matrix\n",
"import time\n",
"\n",
"N_FOLDS = 10\n",
"\n",
"\n",
"def cv_logloss(clf, n_folds=N_FOLDS):\n",
"    \"\"\"Return the log loss of `clf` averaged over a shuffled k-fold split of `train`.\"\"\"\n",
"    score = 0.\n",
"    for train_k, test_k in KFold(len(train), n_folds=n_folds, shuffle=True):\n",
"        clf.fit(train.iloc[train_k], train_y.iloc[train_k])\n",
"        # Score the predicted probability of the positive class (second column of\n",
"        # predict_proba). Hard labels from predict() would be clipped to epsilon\n",
"        # inside logloss and would only measure a scaled error rate.\n",
"        proba = clf.predict_proba(train.iloc[test_k])[:, 1]\n",
"        score += logloss(train_y.iloc[test_k], proba) / n_folds\n",
"    return score\n",
"\n",
"\n",
"def search_best(values, make_clf, start_msg, done_msg):\n",
"    \"\"\"Cross-validate make_clf(v) for each v in `values`.\n",
"\n",
"    Returns (best value, its score, list of all scores); lower score is better.\n",
"    \"\"\"\n",
"    best_value, best_score, scores = 0, 100000, []\n",
"    for value in values:\n",
"        print(start_msg.format(value))\n",
"        t1 = time.time()\n",
"\n",
"        score = cv_logloss(make_clf(value))\n",
"        scores.append(score)\n",
"        if score < best_score:\n",
"            best_value, best_score = value, score\n",
"\n",
"        t2 = time.time()\n",
"        print(done_msg.format(value, t2-t1))\n",
"    return best_value, best_score, scores\n",
"\n",
"\n",
"# find the best n_estimators for RandomForestClassifier\n",
"print('Finding best n_estimators for RandomForestClassifier...')\n",
"range_n = np.logspace(0,2,num=3).astype(int)\n",
"best_n, min_score, scores_n = search_best(\n",
"    range_n,\n",
"    lambda n: RandomForestClassifier(n_estimators=n),\n",
"    \"the number of trees : {0}\",\n",
"    'Done processing {0} trees ({1:.3f}sec)')\n",
"print(best_n, min_score)\n",
"\n",
"\n",
"# find best max_depth for RandomForestClassifier (using the best n_estimators found above)\n",
"print('Finding best max_depth for RandomForestClassifier...')\n",
"range_m = np.logspace(0,2,num=3).astype(int)\n",
"best_m, min_score, scores_m = search_best(\n",
"    range_m,\n",
"    lambda m: RandomForestClassifier(max_depth=m, n_estimators=best_n),\n",
"    \"the max depth : {0}\",\n",
"    'Done processing max depth {0} ({1:.3f}sec)')\n",
"print(best_m, min_score)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualizing parameters for randomForest\n",
"By visualizing the parameters, we can check if the chosen parameter is real
ly the best."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(10,5))\n",
"\n",
"# left panel: score against the number of trees\n",
"plt.subplot(1, 2, 1)\n",
"plt.plot(range_n, scores_n)\n",
"plt.xlabel('number of trees')\n",
"plt.ylabel('score')\n",
"\n",
"# right panel: score against the maximum tree depth\n",
"plt.subplot(1, 2, 2)\n",
"plt.plot(range_m, scores_m)\n",
"plt.ylabel('score')\n",
"plt.xlabel('max depth')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building a final model\n",
"Let's use the parameters we just got for the final model and prediction."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)\n",
"model.fit(train, train_y)\n",
"# predict_proba returns one column per class; keep only the second column\n",
"# (the positive class) so `pred` is 1-D and fits a single submission column.\n",
"pred = model.predict_proba(submission)[:, 1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Making submission data\n",
"Predicted shot_made_flag is written to a csv file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sub = pd.read_csv(\"../input/sample_submission.csv\")\n",
"sub['shot_made_flag'] = pred\n",
"sub.to_csv(\"real_submission.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Anda mungkin juga menyukai