{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
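        "\n# Feature selection\n\nThis example uses psopt's `Combination` optimizer to choose a selection of 15 columns from the scikit-learn breast cancer dataset. Each candidate selection is scored by the mean 3-fold cross-validation score of a random forest, and the test accuracy of a model fitted on the selected columns is then compared with one fitted on all columns.\n\n"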
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from psopt import Combination\n\n# to run this, make sure to have scikit-learn installed\nfrom sklearn.ensemble import RandomForestClassifier as RFC\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import cross_validate as cv\nfrom sklearn.datasets import load_breast_cancer\n\n\ndef main():\n    \"\"\"Select breast cancer features with psopt and compare test accuracy.\n\n    Splits the data 80/20 into train and test sets, asks ``Combination`` to\n    maximize the mean 3-fold cross-validation score of a 10-tree random\n    forest with ``selection_size=15``, then prints the test accuracy of a\n    forest fitted on all columns next to one fitted on the selected columns.\n    \"\"\"\n    # loading breast cancer dataset\n    dataset = load_breast_cancer()\n\n    # single seed handed to the split, the optimizer and every forest, so no\n    # step of the example relies on unseeded randomness\n    seed = 5\n\n    # train-test split\n    train_x, test_x, train_Y, test_Y = train_test_split(\n        dataset.data, dataset.target, test_size=0.2, random_state=seed\n    )\n\n    # create objective function\n    def evaluate(solution):\n        \"\"\"Return the mean 3-fold CV test score for the given columns.\"\"\"\n        # seed the forest so that scoring the same columns twice does not\n        # draw a different random forest each time\n        model = RFC(n_estimators=10, random_state=seed)\n        results = cv(model, train_x[:, solution], train_Y, cv=3)\n        return results[\"test_score\"].mean()\n\n    # instantiate optimizer\n    opt = Combination(evaluate, list(range(train_x.shape[1])), labels=dataset.feature_names)\n\n    # maximize obj function\n    result = opt.maximize(selection_size=15, verbose=True, max_iter=20, seed=seed)\n\n    # map the selected labels back to column indices\n    # (result.solution will have the same effect if labels are not provided to the optimizer)\n    solution = [\n        i for i, name in enumerate(dataset.feature_names) if name in result.solution\n    ]\n\n    # ======================== COMPARISON ========================\n\n    # both forests get the same seed so the printed comparison is repeatable\n    original = RFC(random_state=seed).fit(train_x, train_Y)\n    optimized = RFC(random_state=seed).fit(train_x[:, solution], train_Y)\n\n    print(\"\\nTest accuracy\\n--------------------------\")\n    print(\"All columns: {:.3f}\".format(original.score(test_x, test_Y)))\n    print(\"Solution:  {:.3f}\".format(optimized.score(test_x[:, solution], test_Y)))\n\n\nif __name__ == \"__main__\":\n    main()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}