In [48]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
# Load the student sleep-pattern dataset from the CSV in the working directory.
df = pd.read_csv('student_sleep_patterns.csv')
# Bare last expression: render the frame (pandas truncates the middle rows).
df
Out[48]:
| | Student_ID | Age | Gender | University_Year | Sleep_Duration | Study_Hours | Screen_Time | Caffeine_Intake | Physical_Activity | Sleep_Quality | Weekday_Sleep_Start | Weekend_Sleep_Start | Weekday_Sleep_End | Weekend_Sleep_End |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 24 | Other | 2nd Year | 7.7 | 7.9 | 3.4 | 2 | 37 | 10 | 14.16 | 4.05 | 7.41 | 7.06 |
| 1 | 2 | 21 | Male | 1st Year | 6.3 | 6.0 | 1.9 | 5 | 74 | 2 | 8.73 | 7.10 | 8.21 | 10.21 |
| 2 | 3 | 22 | Male | 4th Year | 5.1 | 6.7 | 3.9 | 5 | 53 | 5 | 20.00 | 20.47 | 6.88 | 10.92 |
| 3 | 4 | 24 | Other | 4th Year | 6.3 | 8.6 | 2.8 | 4 | 55 | 9 | 19.82 | 4.08 | 6.69 | 9.42 |
| 4 | 5 | 20 | Male | 4th Year | 4.7 | 2.7 | 2.7 | 0 | 85 | 3 | 20.98 | 6.12 | 8.98 | 9.01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 495 | 496 | 24 | Male | 2nd Year | 5.1 | 9.3 | 1.9 | 4 | 110 | 4 | 17.42 | 8.43 | 6.93 | 10.78 |
| 496 | 497 | 20 | Male | 2nd Year | 8.9 | 7.7 | 3.5 | 3 | 40 | 4 | 1.22 | 15.54 | 5.85 | 7.23 |
| 497 | 498 | 21 | Male | 3rd Year | 5.7 | 6.4 | 3.9 | 1 | 68 | 10 | 9.94 | 2.25 | 5.46 | 10.72 |
| 498 | 499 | 18 | Female | 2nd Year | 4.9 | 0.5 | 3.5 | 0 | 12 | 2 | 19.10 | 15.49 | 8.35 | 7.20 |
| 499 | 500 | 21 | Male | 3rd Year | 7.9 | 11.6 | 1.0 | 0 | 86 | 1 | 7.54 | 14.12 | 7.01 | 9.19 |
500 rows × 14 columns
In [49]:
# Print each column's dtype and non-null count to check for missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Student_ID           500 non-null    int64
 1   Age                  500 non-null    int64
 2   Gender               500 non-null    object
 3   University_Year      500 non-null    object
 4   Sleep_Duration       500 non-null    float64
 5   Study_Hours          500 non-null    float64
 6   Screen_Time          500 non-null    float64
 7   Caffeine_Intake      500 non-null    int64
 8   Physical_Activity    500 non-null    int64
 9   Sleep_Quality        500 non-null    int64
 10  Weekday_Sleep_Start  500 non-null    float64
 11  Weekend_Sleep_Start  500 non-null    float64
 12  Weekday_Sleep_End    500 non-null    float64
 13  Weekend_Sleep_End    500 non-null    float64
dtypes: float64(7), int64(5), object(2)
memory usage: 54.8+ KB
In [50]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape
Out[50]:
(500, 14)
In [68]:
# For each selected column, average Sleep_Quality over every distinct value of
# that column and draw the averages as a line plot, one figure per column.
columns = ['University_Year', 'Screen_Time', 'Caffeine_Intake']
for col in columns:
    # as_index=False keeps the grouping key as a regular column, so the
    # aggregated frame can be handed straight to seaborn.
    mean_quality = df.groupby(col, as_index=False)['Sleep_Quality'].mean()
    ax = sns.lineplot(data=mean_quality, x=col, y='Sleep_Quality')
    ax.set_title(f'Sleep Quality by {col}')
    # Flush the current figure so the next column starts on a fresh one.
    plt.show()
In [52]:
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
# Project the students onto the first two principal components of their
# numeric features and plot the result.
#
# NOTE(review): `X` was not defined in any cell of this notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All). It is defined here as
# every numeric column except the Student_ID identifier. That feature set is an
# assumption -- TODO confirm which columns were intended.
feature_cols = df.select_dtypes(include='number').columns.drop('Student_ID', errors='ignore')
X = df[feature_cols]

# Standardize each column, then rescale each row with sklearn's `normalize`
# before fitting PCA (same preprocessing order as before).
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
normalized_X = normalize(scaled_X)
normalized_X = pd.DataFrame(normalized_X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(normalized_X)
X_pca = pd.DataFrame(X_pca)
# Principal components are linear combinations of *all* input features, so
# they must not be labelled with the names of two original columns.
X_pca.columns = ['PC1', 'PC2']

# Share of total variance captured by each component, shown on the axes so the
# reader can judge how much structure the 2-D view retains.
pc1_share, pc2_share = pca.explained_variance_ratio_

plt.figure(figsize=(10, 6), dpi=200)
plt.scatter(X_pca['PC1'], X_pca['PC2'])
plt.xlabel(f'PC1 ({pc1_share:.1%} of variance)')
plt.ylabel(f'PC2 ({pc2_share:.1%} of variance)')
plt.title('PCA: first two principal components')
plt.grid(True, alpha=0.3)
plt.show()
In [67]:
# Scatter the raw Screen_Time values against Sleep_Quality, one point per row.
fig, ax = plt.subplots()
ax.scatter(df['Sleep_Quality'], df['Screen_Time'])
ax.set_xlabel('Sleep_Quality')
ax.set_ylabel('Screen_Time')
ax.set_title('Sleep_Quality vs Screen_Time')
# Light grid so values can be read off without dominating the points.
ax.grid(True, alpha=0.3)
plt.show()
In [69]:
# Scatter the raw Physical_Activity values against Sleep_Quality, one point
# per row of the dataset.
plt.scatter(df['Sleep_Quality'], df['Physical_Activity'])
plt.xlabel('Sleep_Quality')
plt.ylabel('Physical_Activity')
# This cell plots two original columns and performs no PCA, so the title no
# longer carries the "PCA:" prefix left over from the earlier cell.
plt.title('Sleep_Quality vs Physical_Activity')
plt.grid(True, alpha=0.3)
plt.show()
In [ ]: