R -- マルチアンサーのクロス集計

マルチアンサーのクロス集計　　　　　Last modified: Dec 12, 2012
目的

単一のカテゴリー変数と複数の該当・非該当の応答のある変数のクロス集計を行う

使用法

ma(i, j, df, latex=TRUE, caption=NULL, label=NULL, output="")

引数

i, j    データフレーム上で，クロス集計をする二変数が入っている，列の番号または変数名ベクトル
        該当・非該当により答える項目は i, j のどちらに指定してもよい
        実際には 1, 3:10 とか 12:17, 2 のように指定する
        注意：データフレーム上で，該当・非該当のデータの定義を間違わないように（使用例参照）
df      読み込んだデータフレームの名前
latex   LaTeX ソースを出力する。Word 用なら，latex=FALSE にする
caption latex=TRUE の場合，各表の表題を文字列ベクトルとして指定することができる（デフォルトではあり合わせの表題を付ける）。
label   latex=TRUE の場合，各表の label を文字列ベクトルとして指定することができる（デフォルトでは付けない）。
output  出力コネクション
        デフルトはコンソールに出力

ソース

インストールは，以下の 1 行をコピーし，R コンソールにペーストする
source("http://aoki2.si.gunma-u.ac.jp/R/src/multianswer.R", encoding="euc-jp")

#####
#
# マルチアンサー項目と別の 1 つの変数についてクロス集計表を作成する
#
#####

ma <- function(i,                                                            # 表側に来る変数が入っているデータフレーム上の列番号または変数名ベクトル
               j,                                                               # 表側に来る変数が入っているデータフレーム上の列番号または変数名ベクトル
                                                                                # i, j いずれかがマルチアンサー項目を表すのでベクトルになる
               df,                                                              # データフレーム
               latex=TRUE,                                                      # LaTeX 形式で度数分布表を出力する（デフォルトは LaTeX 形式）
               caption=NULL,                                                    # latex=TRUE のときに，各表の表題を表す文字列ベクトルを指定できる（NULL のときはデフォルトの表題）
               label=NULL,                                                      # latex=TRUE のときに，各表の label を表す文字列ベクトルを指定できる（NULL のときは付けない）
               output="",                                                       # ファイルに出力するときはファイル名（デフォルトはコンソールに出力）
               encoding=getOption("encoding"))                                  # ファイルに出力するときのエンコーディング（デフォルトは OS による）
{

# 下請け関数
        is.error <- function(k)                                                      # k 列目の変数が factor であり，かつ，二値変数でなければならない
        {
                if (!is.factor(df[,k])) {                                       # factor でない
                        warning(sprintf("%s を factor にしてください", colnames(df)[k]))
                        return(TRUE)
                }
                else if (nlevels(df[,k]) != 2) {                                # 二値変数でない
                        warning(sprintf("%s が二値データではありません", colnames(df)[k]))
                        return(TRUE)
                }
                return(FALSE)
        }

        getNum <- function(str, df) {                                                # 変数名から列番号を得る
                names <- colnames(df)
                seq_along(names)[names %in% str]
        }

# 関数本体
        if (output != "") {                                                     # 結果をファイルに出力する場合の処理
                output <- file(output, open="w", encoding=encoding)
        }
        if (is.character(i[1])) {
                i <- getNum(i, df)
        }
        if (is.character(j[1])) {
                j <- getNum(j, df)
        }
        if (length(i) == 1 && length(j) > 1) {                                    # i の方が普通の変数（1 つ），j の方がマルチアンサー変数（複数）のとき
                row <- TRUE                                                  # 表頭にマルチアンサー項目，row = TRUE として識別
        }
        else if (length(i) > 1 && length(j) == 1) {                               # i の方がマルチアンサー変数（複数），j  の方が普通の変数（1 つ）のとき
                row <- FALSE                                                 # 表側にマルチアンサー項目，row = FALSE として識別
                temp <- i                                                    # row == TRUE のときと同じように処理するために，i, j の内容を入れ替える
                i <- j
                j <- temp
        }
        else {                                                                  # 普通の変数が複数個指定されるような場合は想定していない
                stop("このプログラムでは，i, j いずれかが要素 1 のスカラー，他方が要素2以上のベクトルであることを仮定しています。使用法に誤りがあります")
        }
        if (any(sapply(j, is.error))) stop("データの準備に問題があります")      # マルチアンサー変数のチェック

        df.i <- as.factor(df[,i])                                            # 普通の変数の方も factor にされている方が望ましいが，そうではないときのために
        ans <- sapply(j, function(k) table(df[,i], df[,k]))[1:nlevels(df.i),]        # マルチアンサー集計表本体を作る
        ans <- cbind(ans, table(df.i))                                               # 合計列を作る（普通の変数の方の度数分布）
        ans <- rbind(ans, colSums(ans))                                              # 合計行を作る
        rownames(ans) <- c(levels(df.i), "合計")                             # 行の名前
        colnames(ans) <- c(colnames(df[,j]), "該当数")                               # 列の名前
        nc <- ncol(ans)                                                              # 列数
        pct <- ans/ans[,nc]*100                                                      # 行方向パーセント
        if (!row) {                                                             # row == FALSE の場合に，
                ans <- t(ans)                                                        # ans を転置
                pct <- t(pct)                                                        # pct を転置
                nc <- ncol(ans)                                                      # 転置後の列数を再計算
        }
        nr <- nrow(ans)                                                              # 最終時点の集計表の行数
        if (latex) {                                                            # LaTeX 形式で集計結果を出力する
                cat("\\begin{table}[htbp]\n", file=output)                      # \begin{table}[htbp] 
                if (is.null(caption)) {
                        cat("\\caption{マルチアンサー項目の集計}\n", file=output) # \caption{マルチアンサー項目の集計}
                }
                else {
                        cat(sprintf("\\caption{%s}\n", caption), file=output)   # 指定した表題
                }
                if (!is.null(label)) {
                        cat(sprintf("\\label{%s}\n", label), file=output)       # 指定したラベル
                }
                cat("\\centering\n", file=output)                                                                       # \centering
                cat("\\begin{tabular}{l", rep("c", nc), "} \\hline\n", sep="", file=output)                             # \begin{tabular}{cc…c} \hline
                if (row) {                                                                                              # 表頭にマルチアンサー項目
                        cat(colnames(df)[i], colnames(ans), sep=" & ", file=output)                                  # 変数名 & マルチアンサー変数i & …
                        cat("\\\\ \\hline\n", file=output)                                                              # \\ \hline
                        for (i in 1:nr) {                                                                               # 各行について，
                                cat(rownames(ans)[i], ans[i,], sep=" & ", file=output)                                       # 行名 & 集計数i & …
                                cat("\\\\\n", file=output)                                                              # \\
                                cat("\\%", sprintf("{\\small \\textit{%.1f}}", pct[i,-nc]), sep=" & ", file=output)  # % & パーセントi & …
                                cat("\\\\", file=output)                                                                # \\
                                if (i >= nr-1) cat("\\hline\n", file=output) else cat("\n", file=output)                # 最終行の前なら \\
                        }
                }
                else {                                                                                                  # 表側にマルチアンサー項目
                        cat(sprintf("& \\multicolumn{%i}{c}{%s}\\\\ \\cline{2-%i}\n", nc-1, colnames(df)[i], nc),    # マルチアンサーではない方の変数名
                            file=output)
                        cat("", colnames(ans), sep=" & ", file=output)                                                       # 列名（変数のカテゴリー名）
                        cat("\\\\ \\hline\n", file=output)                                                              # \\ \hline
                        for (i in 1:nr) {                                                                               # 各行（マルチアンサー項目）ごとに，
                                cat(rownames(ans)[i], ans[i,], sep=" & ", file=output)                                       # 行名 & 集計数i & …
                                cat("\\\\\n", file=output)                                                              # \\
                                if (i < nr) {                                                                                # 最終行でないときは，
                                        cat("\\%", sprintf("{\\small \\textit{%.1f}}", pct[i,]), sep=" & ", file=output)# % & パーセントi & …
                                        cat("\\\\", file=output)                                                        # \\
                                }
                                if (i >= nr-1) cat("\\hline\n", file=output) else cat("\n", file=output)                # 最終行の前なら \\
                        }
                }
                cat("\\end{tabular}\n", file=output)                                                                    # \end{tabular}
                cat("\\end{table}\n", file=output)                                                                      # \end{table}
        }
        else {                                                                  # tab で区切って出力する
                cat("表　マルチアンサー項目の集計\n", file=output)                                                      # マルチアンサー項目の集計
                if (row) {
                        cat("\n", file=output, fill=TRUE)
                        cat(colnames(df)[i], colnames(ans), sep="\t", file=output, fill=TRUE)                           # 変数名　マルチアンサー変数i …
                        for (i in 1:nr) {                                                                               # 各行について，
                                cat(rownames(ans)[i], ans[i,], sep="\t", file=output, fill=TRUE)                        # 行名　　集計数i　　　…
                                cat("%", sprintf("%.1f", pct[i,-nc]), sep="\t", file=output, fill=TRUE)                 # % 　　　パーセントi　…
                        }
                }
                else {                                                                                                  # 表側にマルチアンサー項目を置く場合
                        cat("\n", colnames(df)[i], sep="\t", file=output, fill=TRUE)                                    # マルチアンサーではない方の変数名
                        cat("", colnames(ans), sep="\t", file=output, fill=TRUE)                                        # 列名（変数のカテゴリー名）
                        for (i in 1:nr) {                                                                               # 各行（マルチアンサー項目）ごとに，
                                cat(rownames(ans)[i], ans[i,], sep="\t", file=output, fill=TRUE)                        # 行名　集計数i　…
                                if (i < nr) {                                                                                # 最終行でないときは，
                                        cat("%", sprintf("%.1f", pct[i,]), sep="\t", file=output, fill=TRUE)            # % 　　パーセントi　…
                                }
                        }
                }
        }
        if (output != "") {                                                     # 結果をファイルに出力した場合の後始末
                close(output)
        }
}


使用例

性別	血液型	項目1	項目2	項目3	項目4	項目5
1	1	1	1	0	1	1
2	2	0	1	0	1	1
1	3	0	0	0	0	1
1	4	1	1	1	1	1
1	1	1	0	0	1	0
2	2	0	1	1	1	1
2	3	1	1	0	1	0
1	3	1	1	1	1	0
2	2	1	1	0	0	0
2	1	0	0	1	0	0
のようなファイル test.dat があるとする

入力と変数の定義
df <- read.table("test.dat", header=TRUE)
性別と血液型は通常のカテゴリー変数
df[,1] <- factor(df[,1], levels=1:2, labels=c("男", "女"))
df[,2] <- factor(df[,2], levels=1:4, labels=c("A", "B", "O", "AB"))
項目1～項目5 は該当なら1，非該当なら0で入力されている
集計したいことを表す数値（この例なら「該当」）を levels の1番目の数値にすること
for (i in 3:7) df[,i] <- factor(df[,i], levels=1:0, labels=c("該当", "非該当"))

コンソールに出力するとき
ma(2, 3:7, df)

ファイルに出力するとき
ma(2, 3:7, df, output="ファイル名", encoding="EUC-JP")

出力結果例

latex=FALSE の場合

> ma(2, 3:7, df, latex=FALSE) # 二番目の引数がマルチアンサー項目の場合

血液型	項目1	項目2	項目3	項目4	項目5	該当数
A	2	1	1	2	1	3
%	66.7	33.3	33.3	66.7	33.3
B	1	3	1	2	2	3
%	33.3	100.0	33.3	66.7	66.7
O	2	2	1	2	1	3
%	66.7	66.7	33.3	66.7	33.3
AB	1	1	1	1	1	1
%	100.0	100.0	100.0	100.0	100.0
合計	6	7	4	7	5	10
%	60.0	70.0	40.0	70.0	50.0

注：% は各項目に答えた数を右端にある該当数で割ったものである。
　　マルチアンサーのため，% を合計しても 100% にはならない。

> ma(3:7, 2, df, latex=FALSE) # 一番目の引数がマルチアンサー項目の場合

	血液型
	A	B	O	AB	合計
項目1	2	1	2	1	6
%	66.7	33.3	66.7	100.0	60.0
項目2	1	3	2	1	7
%	33.3	100.0	66.7	100.0	70.0
項目3	1	1	1	1	4
%	33.3	33.3	33.3	100.0	40.0
項目4	2	2	2	1	7
%	66.7	66.7	66.7	100.0	70.0
項目5	1	2	1	1	5
%	33.3	66.7	33.3	100.0	50.0
該当数	3	3	3	1	10

注：% は各項目に答えた数を最下行にある該当数で割ったものである。
　　マルチアンサーのため，% を合計しても 100% にはならない。

latex=TRUE（デフォルト）の場合

> ma(2, 3:7, df) # 二番目の引数がマルチアンサー項目の場合

\begin{table}[htbp]
\begin{center}
\begin{tabular}{lcccccc} \hline
血液型 & 項目1 & 項目2 & 項目3 & 項目4 & 項目5 & 該当数\\ \hline
A & 2 & 1 & 1 & 2 & 1 & 3\\
\% & {\small \textit{66.7}} & {\small \textit{33.3}} & {\small \textit{33.3}} & {\small \textit{66.7}} & {\small \textit{33.3}}\\
B & 1 & 3 & 1 & 2 & 2 & 3\\
\% & {\small \textit{33.3}} & {\small \textit{100.0}} & {\small \textit{33.3}} & {\small \textit{66.7}} & {\small \textit{66.7}}\\
O & 2 & 2 & 1 & 2 & 1 & 3\\
\% & {\small \textit{66.7}} & {\small \textit{66.7}} & {\small \textit{33.3}} & {\small \textit{66.7}} & {\small \textit{33.3}}\\
AB & 1 & 1 & 1 & 1 & 1 & 1\\
\% & {\small \textit{100.0}} & {\small \textit{100.0}} & {\small \textit{100.0}} & {\small \textit{100.0}} & {\small \textit{100.0}}\\\hline
合計 & 6 & 7 & 4 & 7 & 5 & 10\\
\% & {\small \textit{60.0}} & {\small \textit{70.0}} & {\small \textit{40.0}} & {\small \textit{70.0}} & {\small \textit{50.0}}\\\hline
\end{tabular}
\end{center}
\end{table}

> ma(3:7, 2, df) # 一番目の引数がマルチアンサー項目の場合

\begin{table}[htbp]
\begin{center}
\begin{tabular}{lccccc} \hline
& \multicolumn{4}{c}{血液型}\\ \cline{2-5}
 & A & B & O & AB & 合計\\ \hline
項目1 & 2 & 1 & 2 & 1 & 6\\
\% & {\small \textit{66.7}} & {\small \textit{33.3}} & {\small \textit{66.7}} & {\small \textit{100.0}} & {\small \textit{60.0}}\\
項目2 & 1 & 3 & 2 & 1 & 7\\
\% & {\small \textit{33.3}} & {\small \textit{100.0}} & {\small \textit{66.7}} & {\small \textit{100.0}} & {\small \textit{70.0}}\\
項目3 & 1 & 1 & 1 & 1 & 4\\
\% & {\small \textit{33.3}} & {\small \textit{33.3}} & {\small \textit{33.3}} & {\small \textit{100.0}} & {\small \textit{40.0}}\\
項目4 & 2 & 2 & 2 & 1 & 7\\
\% & {\small \textit{66.7}} & {\small \textit{66.7}} & {\small \textit{66.7}} & {\small \textit{100.0}} & {\small \textit{70.0}}\\
項目5 & 1 & 2 & 1 & 1 & 5\\
\% & {\small \textit{33.3}} & {\small \textit{66.7}} & {\small \textit{33.3}} & {\small \textit{100.0}} & {\small \textit{50.0}}\\\hline
該当数 & 3 & 3 & 3 & 1 & 10\\
\hline
\end{tabular}
\end{center}
\end{table}
直前のページへ戻る　　
E-mail to Shigenobu AOKI