# Chi-square test, PCA, logistic regression

# Point the session at the folder that holds Data_new_SM.csv; the relative
# file names used further down are resolved against this directory.
# NOTE(review): the absolute path is machine-specific and must be adjusted
# before running the script elsewhere.
setwd("D:/dane_pulpit_2_12/dydaktyka/Analzia_danych_przemysłowych/PCA_RL/data")

# Preview only the first rows of the raw file. Calling read.csv() without
# assigning the result would print the entire data set to the console.
head(read.csv("Data_new_SM.csv", header = TRUE, sep = ";"))

# The following process variables were collected:
# X1 – production worker, X2 – operation time, X3 – weekday,
# X4 – AM check points, X5 – AM range, X6 – AM duration,
# X7 – PM timeliness, X8 – employee performing PM, X9 – problems reported after AM,
# X10 – problems reported after PM.
# The output CTQ (Y) has two levels:
# OK – the machine is available in the required time,
# NOK – the machine is unavailable in the required time.

# Importing the dataset
# Load the semicolon-separated process data into `dd`; the first row of the
# file supplies the column names.
dd <- read.csv("Data_new_SM.csv", header = TRUE, sep = ";")

# Per-column descriptive summary of the imported data.
summary(dd)


library(ggplot2)

# Show the class of every column in `dd`.
sapply(X = dd, FUN = class)

# Exploratory box plots. Each plot is printed immediately because the ggplot
# object is the value of a top-level expression.

# X1 by Y, coloured by Y.
ggplot(dd, aes(x = Y, y = X1, color = as.factor(Y))) +
  geom_boxplot()

# X2 against X1, coloured by Y.
ggplot(dd, aes(x = X1, y = X2, color = as.factor(Y))) +
  geom_boxplot()

# X4 by Y, coloured by X3.
ggplot(dd, aes(x = Y, y = X4, color = as.factor(X3))) +
  geom_boxplot()

# X6 by Y, coloured by Y.
ggplot(dd, aes(x = Y, y = X6, color = as.factor(Y))) +
  geom_boxplot()

# X1 by Y, coloured by X9.
ggplot(dd, aes(x = Y, y = X1, color = as.factor(X9))) +
  geom_boxplot()


#Chi square test

# Pearson chi-square tests of independence between each listed predictor and
# the outcome Y. X6 is not tested here.
# Per the chisq.test() documentation, `correct` (Yates' continuity correction)
# only has an effect when the contingency table is 2 x 2.

# With continuity correction requested.
chisq.test(x = dd$X1, y = dd$Y, correct = TRUE)
chisq.test(x = dd$X2, y = dd$Y, correct = TRUE)

# Without continuity correction.
chisq.test(x = dd$X3, y = dd$Y, correct = FALSE)
chisq.test(x = dd$X4, y = dd$Y, correct = FALSE)
chisq.test(x = dd$X5, y = dd$Y, correct = FALSE)
chisq.test(x = dd$X7, y = dd$Y, correct = FALSE)
chisq.test(x = dd$X8, y = dd$Y, correct = FALSE)
chisq.test(x = dd$X9, y = dd$Y, correct = FALSE)
chisq.test(x = dd$X10, y = dd$Y, correct = FALSE)

# Principal Component Analysis

# Principal component analysis on every column of `dd` except column 11
# (the column this script uses as the observed outcome). `scale. = TRUE`
# rescales each variable to unit variance before the decomposition.
pca <- prcomp(dd[, -11], scale. = TRUE)
pca


library(factoextra)
library(tidyverse)

# Standard deviation and proportion of variance for each component.
summary(pca)

# Scree plot with the percentage of explained variance printed on each bar.
fviz_screeplot(pca, addlabels = TRUE)


# Export the component scores. write.csv2() writes ";" as the field separator
# and "," as the decimal mark.
write.csv2(pca$x, file = "substitute_var_1.csv")

# Read the scores back with the matching decimal mark. With the default
# dec = "." the score columns would be parsed as character strings, not
# numbers.
dd1 <- read.csv("substitute_var_1.csv", header = TRUE, sep = ";", dec = ",")


#Dependencies_between_PCA_components

# Scatter plot of the scores on the first two principal components.
ggplot(as.data.frame(pca$x), aes(x = PC1, y = PC2)) +
  geom_point()


# Scatter plot of the scores on PC2 against PC4.
ggplot(as.data.frame(pca$x), aes(x = PC2, y = PC4)) +
  geom_point()

# Box plots of selected component scores split by the observed outcome Y.
# Base boxplot() silently ignores a ggplot2 aes() mapping (and Y is not a
# column of pca$x), so the grouping is expressed with a formula on a data
# frame that combines the scores with Y from `dd`.
# NOTE(review): PC2, PC3 and PC4 are the components named in the previous
# version of these calls; confirm these are the comparisons wanted.
pc_scores_y <- data.frame(pca$x, Y = as.factor(dd$Y))

boxplot(PC2 ~ Y, data = pc_scores_y, xlab = "Y", ylab = "PC2")
boxplot(PC3 ~ Y, data = pc_scores_y, xlab = "Y", ylab = "PC3")
boxplot(PC4 ~ Y, data = pc_scores_y, xlab = "Y", ylab = "PC4")

# or another way
# Default factoextra PCA plot of the prcomp result.
fviz_pca(pca)

library(ggplot2)
library(FactoMineR)

# Observed outcome taken from column 11 of the input data.
Y_obserw <- dd[, 11]

# Biplot with observations drawn as points and coloured by the outcome.
# `habillage` is documented as a factor, and `repel` as a logical, so both are
# passed with the proper type instead of a raw column and the string "TRUE".
fviz_pca_biplot(
  pca,
  geom = "point",
  habillage = as.factor(Y_obserw),
  repel = TRUE
)


# Alternative approach

# Working directory for the alternative PCA approach.
# NOTE(review): the absolute path is machine-specific.
setwd("D:/dane_pulpit_2_12/dydaktyka/Analzia_danych_przemysłowych/PCA_RL/data")

# Reload the process data into `dd` so this section can be run on its own.
# An unassigned read.csv() would only print the file and discard it.
dd <- read.csv("Data_new_SM.csv", header = TRUE, sep = ";")


#PCA with use of  FactoMineR

library(FactoMineR)
# FactoMineR PCA on every column except column 11, keeping 4 components in
# the result. This rebinds `pca`, replacing the prcomp object created earlier
# in the script.
pca <- PCA(X = dd[, -11], ncp = 4)

# Plot the fitted PCA object.
plot(pca)

# Print the FactoMineR summary of the fit.
summary(pca)





# Logistic regression 

library(ISLR)

# Importing the dataset

# Output values

# Show the observed outcome. `dd1` is read from a file written from pca$x
# (the PC scores), so `dd1$y` is always NULL; the outcome lives in `dd$Y`.
dd$Y

# Folder holding the PC scores combined with the outcome column.
# NOTE(review): the absolute path is machine-specific, and nothing in this
# script writes substitute_var_1_y.csv. It appears to be prepared outside
# this script; confirm how it is produced.
setwd("F:/projekt_BIP_2023/guimaraes_25-29_09/data")

# Read the scores plus outcome; the file uses ";" as the separator and "," as
# the decimal mark.
dd2 <- read.csv("substitute_var_1_y.csv", header = TRUE, sep = ";", dec = ",")

str(dd2)

# glm(family = binomial) needs a two-level response, so convert Y to a factor.
dd2$Y <- as.factor(dd2$Y)
str(dd2)

# Plots

# PC1 against PC4, points coloured by the outcome.
ggplot(dd2, aes(x = PC4, y = PC1, color = as.factor(Y))) +
  geom_point()

# Distribution of PC1 for each outcome level.
ggplot(dd2, aes(x = Y, y = PC1, color = as.factor(Y))) +
  geom_boxplot()

# Distribution of PC4 for each outcome level.
ggplot(dd2, aes(x = Y, y = PC4, color = as.factor(Y))) +
  geom_boxplot()

# Distribution of PC2 for each outcome level.
ggplot(dd2, aes(x = Y, y = PC2, color = as.factor(Y))) +
  geom_boxplot()

#Data divided: train and test set 

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a label (1 or 2) for every row of dd2, with probability 0.8 for 1 and
# 0.2 for 2. The split sizes are therefore random, not an exact 80/20.
ind <- sample(2, nrow(dd2), replace = TRUE, prob = c(0.8, 0.2))

# Rows labelled 1 form the training set, rows labelled 2 the test set.
train <- dd2[ind == 1, ]
test <- dd2[ind == 2, ]


#Logistic regression

# Logistic regression models of Y on subsets of the principal components.
# All models are fitted on `train` only, so that the held-out `test` rows stay
# unseen and predictions on `test` give an honest out-of-sample check.

# Two-component model. `mymodel` is overwritten by the next fit; this one is
# kept only for its printed output.
mymodel <- glm(Y ~ PC1 + PC4, data = train, family = binomial)

mymodel

summary(mymodel)

# Three-component model; this is the fit that stays bound to `mymodel`.
mymodel <- glm(Y ~ PC1 + PC2 + PC4, data = train, family = binomial)

mymodel

summary(mymodel)

# Model with the first seven components.
mymodel1 <- glm(
  Y ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7,
  data = train,
  family = binomial
)

mymodel1

summary(mymodel1)

# Reduced model without PC5 and PC7.
mymodel2 <- glm(
  Y ~ PC1 + PC2 + PC3 + PC4 + PC6,
  data = train,
  family = binomial
)

mymodel2

summary(mymodel2)

# Reduced model that also leaves out PC1.
mymodel3 <- glm(Y ~ PC2 + PC3 + PC4 + PC6, data = train, family = binomial)

mymodel3

summary(mymodel3)

# NOTE(review): this formula is identical to mymodel3, so the fit is a
# duplicate. Confirm which terms were intended for mymodel4.
mymodel4 <- glm(Y ~ PC2 + PC3 + PC4 + PC6, data = train, family = binomial)

mymodel4

summary(mymodel4)

#Prediction

# Predicted probabilities (type = "response") from the fitted models.

# mymodel on every row of dd2.
p1 <- predict(mymodel, newdata = dd2, type = "response")

head(p1)

head(dd2)

# mymodel1 on every row of dd2.
p2 <- predict(mymodel1, newdata = dd2, type = "response")

head(p2)

head(dd2)

# mymodel1 on the held-out test rows.
p3 <- predict(mymodel1, newdata = test, type = "response")

head(p3)

# Show the rows that p3 refers to: the predictions line up with `test`, not
# with the first rows of dd2.
head(test)