Column Transformer with Mixed Types

This example illustrates how to apply different preprocessing and feature extraction pipelines to different subsets of features, using sklearn.compose.ColumnTransformer. This is particularly useful for datasets that contain heterogeneous data types, since we may want to scale the numeric features and one-hot encode the categorical ones.

In this example, the numeric data is standard-scaled after median-imputation, while the categorical data is one-hot encoded after imputing missing values with a new category ('missing').

Finally, the preprocessing pipeline is integrated into a full prediction pipeline using sklearn.pipeline.Pipeline, together with a simple classification model.
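
Before walking through the Titanic example below, the core ColumnTransformer pattern can be sketched on a tiny made-up DataFrame (the 'size' and 'color' columns are illustrative only and are not part of the example data):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

toy = pd.DataFrame({'size': [1.0, 2.0, 3.0],
                    'color': ['red', 'blue', 'red']})
toy_preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), ['size']),    # scale the numeric column
    ('cat', OneHotEncoder(), ['color'])])   # one-hot encode the categorical column
print(toy_preprocessor.fit_transform(toy))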

# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively, X and y can be obtained directly from the frame attribute of
# the Bunch returned by fetch_openml when return_X_y is left at its default:
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
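
# Optional sanity check (a hedged aside, not required for the pipeline below):
# fitting the preprocessor on its own shows the transformed feature matrix, in
# which the two numeric columns remain two scaled columns and each categorical
# column expands into one indicator column per observed category (including
# 'missing' where values were imputed).
print("preprocessed feature matrix shape:",
      preprocessor.fit_transform(X).shape)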

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))