井出草平の研究ノート

ロジットモデルのForward StabilityとModel Path Selection

データの読み込み

こちら(https://ides.hatenablog.com/entry/2024/06/21/235127)で使った心臓病のデータを使用する。

library(dplyr)

# Processed Cleveland heart-disease file; it has no header row and marks
# missing values with "?".
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)
heart_data <- read.csv(
  url,
  header = FALSE,
  col.names = column_names,
  na.strings = "?"
)

# Binary outcome: any num > 0 is treated as "heart disease present".
heart_data <- mutate(heart_data, class = ifelse(num > 0, 1, 0))
# num is no longer needed once class has been derived from it.
heart_data <- select(heart_data, -num)
# Drop every row that contains a missing value.
heart_data <- na.omit(heart_data)

Forward Stability

library(ModelPath)
# Run Forward Stability with a binomial glm on the heart-disease data,
# using `class` as the response and a path depth of 3.
# NOTE(review): condense = FALSE is the only argument that differs from the
# Model Path Selection run in this post -- confirm its exact meaning against
# the ModelPath documentation.
logit_fs <- full.select.gen(
  myframe = heart_data,
  resp.name = "class",
  model = "glm",
  fun.args = list(family = "binomial"),
  depth = 3,
  r = 100,
  condense = FALSE
)
print(logit_fs)
# Plot the result as a graph.
build.tree(logit_fs)

結果。

      [,1]   [,2]      [,3]      
 [1,] "fbs"  "chol"    "restecg" 
 [2,] "fbs"  "chol"    "trestbps"
 [3,] "fbs"  "chol"    "age"     
 [4,] "fbs"  "restecg" "chol"    
 [5,] "fbs"  "restecg" "trestbps"
 [6,] "fbs"  "age"     "trestbps"
 [7,] "fbs"  "age"     "chol"    
 [8,] "fbs"  "age"     "restecg" 
 [9,] "chol" "fbs"     "restecg" 
[10,] "chol" "fbs"     "trestbps"
[11,] "chol" "restecg" "fbs"     
[12,] "chol" "restecg" "trestbps"
[13,] "chol" "restecg" "age"

Model Path Selection (MPS)

# Run Model Path Selection (MPS) with a binomial glm on the heart-disease
# data, using `class` as the response and a path depth of 3.
# NOTE(review): condense = TRUE is the only argument that differs from the
# Forward Stability run in this post -- confirm its exact meaning against
# the ModelPath documentation.
logit_mps <- full.select.gen(
  myframe = heart_data,
  resp.name = "class",
  model = "glm",
  fun.args = list(family = "binomial"),
  depth = 3,
  r = 100,
  condense = TRUE
)
print(logit_mps)
# Plot the result as a graph.
build.tree(logit_mps)

結果。

      [,1]   [,2]       [,3]      
 [1,] "fbs"  "chol"     "restecg" 
 [2,] "fbs"  "chol"     "trestbps"
 [3,] "fbs"  "chol"     "age"     
 [4,] "fbs"  "restecg"  "trestbps"
 [5,] "fbs"  "age"      "restecg" 
 [6,] "fbs"  "age"      "trestbps"
 [7,] "chol" "restecg"  "age"     
 [8,] "chol" "trestbps" "restecg" 
 [9,] "chol" "trestbps" "age"  

ソースコード(https://github.com/nkissel/MPS/blob/main/kisselmentch2021/applications.R)をみると、binomialの指定はmodel = "glm", fun.args = list(family = binomial())となっている。今回はmodel = "glm", fun.args = list(family = "binomial")を使ったが、どちらでも走るようである。argumentの記法としてはlist(family = "binomial")の方が使われることが多いと思うが、どちらでも走るのでどうでもよいことかもしれない。