# Chi-square test, PCA, logistic regression

# Packages used by the script.
library(ggplot2)
library(factoextra)
library(tidyverse)
library(FactoMineR)
library(ISLR)

# Location of the raw process data. Files are addressed with file.path()
# instead of calling setwd(), so running the script does not change the
# session's working directory.
data_dir <-
  "D:/dane_pulpit_2_12/dydaktyka/Analzia_danych_przemysłowych/PCA_RL/data"

# The following data were collected from the process:
# x1 – production worker, x2 – operation time, x3 – week day,
# x4 – check points of AM, x5 – range of AM, x6 – AM duration time,
# x7 – PM timeliness, x8 – employee performing PM,
# x9 – reported problems after AM, x10 – reported problems after PM.
# The output CTQ (Y) was defined on two levels:
# OK  – the machine is available in the required time,
# NOK – the machine is unavailable in the required time.

# Importing the dataset ----
dd <- read.csv(
  file.path(data_dir, "Data_new_SM.csv"),
  header = TRUE, sep = ";"
)
summary(dd)
sapply(dd, class)

# Observed output; column 11 is assumed to be Y (the PCA below excludes the
# same column) -- TODO confirm against the CSV header.
Y_obserw <- dd[, 11]

# Exploratory box plots ----
ggplot(dd) + geom_boxplot(aes(x = Y, y = X1, color = as.factor(Y)))
ggplot(dd) + geom_boxplot(aes(x = X1, y = X2, color = as.factor(Y)))
ggplot(dd) + geom_boxplot(aes(x = Y, y = X4, color = as.factor(X3)))
ggplot(dd) + geom_boxplot(aes(x = Y, y = X6, color = as.factor(Y)))
ggplot(dd) + geom_boxplot(aes(x = Y, y = X1, color = as.factor(X9)))

# Chi-square tests of independence between each predictor and Y ----
# `correct` only affects 2x2 tables (continuity correction).
# X6 is the only predictor that is not tested here.
chisq.test(dd$X1, dd$Y, correct = TRUE)
chisq.test(dd$X2, dd$Y, correct = TRUE)
chisq.test(dd$X3, dd$Y, correct = FALSE)
chisq.test(dd$X4, dd$Y, correct = FALSE)
chisq.test(dd$X5, dd$Y, correct = FALSE)
chisq.test(dd$X7, dd$Y, correct = FALSE)
chisq.test(dd$X8, dd$Y, correct = FALSE)
chisq.test(dd$X9, dd$Y, correct = FALSE)
chisq.test(dd$X10, dd$Y, correct = FALSE)

# Principal Component Analysis ----
# Column 11 is excluded; the remaining columns are centred and scaled.
pca <- prcomp(dd[, -11], scale. = TRUE)
pca
summary(pca)
fviz_screeplot(pca, addlabels = TRUE)

# Export the component scores. write.csv2() writes ";" as separator and ","
# as decimal mark, so the file has to be read back with dec = ","; otherwise
# the score columns are parsed as character strings.
scores_file <- file.path(data_dir, "substitute_var_1.csv")
write.csv2(pca$x, file = scores_file)
dd1 <- read.csv(scores_file, header = TRUE, sep = ";", dec = ",")

# Dependencies between PCA components ----
# pca$x holds only the component scores, so Y is attached explicitly before
# it is used as a grouping variable.
pca_scores <- as.data.frame(pca$x)
pca_scores$Y <- as.factor(Y_obserw)

ggplot(pca_scores, aes(x = PC1, y = PC2)) + geom_point()
ggplot(pca_scores, aes(x = PC2, y = PC4)) + geom_point()

# Box plots of component scores per output class. The original passed a
# ggplot aes() mapping to base boxplot(), which ignores such mappings.
ggplot(pca_scores) + geom_boxplot(aes(x = Y, y = PC4, color = Y))
ggplot(pca_scores) + geom_boxplot(aes(x = Y, y = PC3, color = Y))

# Or another way: factoextra biplots.
fviz_pca(pca)
fviz_pca_biplot(
  pca,
  geom = "point",
  habillage = as.factor(Y_obserw),
  repel = TRUE
)

# Other solution: PCA with FactoMineR ----
# Stored under its own name so the prcomp result in `pca` (and pca$x used
# above) is not overwritten.
pca_fm <- PCA(dd[, -11], ncp = 4)
plot(pca_fm)
summary(pca_fm)

# Logistic regression ----
# Importing the dataset
# Output values
# Directory with the PCA scores that carry the output column Y.
# NOTE(review): substitute_var_1_y.csv is presumably the exported score file
# with Y appended by hand -- confirm how it is produced.
# The path is built with file.path() instead of setwd(), so the session's
# working directory is left untouched.
scores_dir <- "F:/projekt_BIP_2023/guimaraes_25-29_09/data"

dd2 <- read.csv(
  file.path(scores_dir, "substitute_var_1_y.csv"),
  header = TRUE, sep = ";", dec = ","
)
str(dd2)
dd2$Y <- as.factor(dd2$Y)  # glm(family = "binomial") needs a two-level response
str(dd2)

# Plots ----
ggplot(dd2) + geom_point(aes(x = PC4, y = PC1, color = as.factor(Y)))
ggplot(dd2) + geom_boxplot(aes(x = Y, y = PC1, color = as.factor(Y)))
ggplot(dd2) + geom_boxplot(aes(x = Y, y = PC4, color = as.factor(Y)))
ggplot(dd2) + geom_boxplot(aes(x = Y, y = PC2, color = as.factor(Y)))

# Data divided into train and test sets ----
# Each row is assigned to set 1 with probability 0.8 or set 2 with
# probability 0.2; the seed makes the split reproducible.
set.seed(1234)
ind <- sample(2, nrow(dd2), replace = TRUE, prob = c(0.8, 0.2))
train <- dd2[ind == 1, ]
test <- dd2[ind == 2, ]

# Logistic regression ----
# All models are fitted on `train` only. Fitting on the full data set, as the
# original did, lets the model see the test rows and makes the test-set
# prediction below meaningless as a hold-out check.
mymodel_pc14 <- glm(Y ~ PC1 + PC4, data = train, family = "binomial")
mymodel_pc14
summary(mymodel_pc14)

mymodel <- glm(Y ~ PC1 + PC2 + PC4, data = train, family = "binomial")
mymodel
summary(mymodel)

mymodel1 <- glm(
  Y ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7,
  data = train, family = "binomial"
)
mymodel1
summary(mymodel1)

mymodel2 <- glm(
  Y ~ PC1 + PC2 + PC3 + PC4 + PC6,
  data = train, family = "binomial"
)
mymodel2
summary(mymodel2)

mymodel3 <- glm(Y ~ PC2 + PC3 + PC4 + PC6, data = train, family = "binomial")
mymodel3
summary(mymodel3)

# NOTE(review): same formula as mymodel3 -- presumably a different set of
# components was intended here; confirm.
mymodel4 <- glm(Y ~ PC2 + PC3 + PC4 + PC6, data = train, family = "binomial")
mymodel4
summary(mymodel4)

# Prediction ----
# type = "response" returns fitted probabilities rather than log-odds.
# In-sample predictions on the training rows:
p1 <- predict(mymodel, train, type = "response")
head(p1)
head(train)

p2 <- predict(mymodel1, train, type = "response")
head(p2)
head(train)

# Out-of-sample predictions on the held-out rows; head(test) shows the rows
# that correspond to head(p3).
p3 <- predict(mymodel1, test, type = "response")
head(p3)
head(test)