Data Analysis using Python (NEP)
The CSV files are provided at the end.
1.Probability
a:Calculating the simple probabilities
import pandas as pd
# Estimate the probability of each outcome in the 'Survived' column as its
# relative frequency: occurrences of the outcome divided by the row count.
df = pd.read_csv('train.csv')
survived = df['Survived']
outcome_counts = survived.value_counts()
probability_event = outcome_counts.div(len(survived))
print(probability_event)
[Note: use a CSV file of your choice and replace 'Survived' with the matching column name from that file.]
b:Applications of Probability distributions to real life problems
(A):Normal Distribution
import numpy as np
import matplotlib.pyplot as plt
# Parameters of the normal distribution and the number of values to draw.
mean, std_dev = 0, 1
sample_size = 1000

# Draw the sample and show it as a density-normalised histogram.
random_sample = np.random.normal(loc=mean, scale=std_dev, size=sample_size)
plt.hist(random_sample, bins=30, density=True, alpha=0.75, color='blue')

# Evaluate the normal density formula across the x-range the histogram
# currently occupies, then overlay it as a black curve.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
normalising_constant = 1 / (std_dev * np.sqrt(2 * np.pi))
exponent = -(x - mean)**2 / (2 * std_dev**2)
p = normalising_constant * np.exp(exponent)
plt.plot(x, p, 'k', linewidth=2)

# Label and display the figure.
plt.title("Normal Distribution")
plt.xlabel("Value")
plt.ylabel("Probability Density")
plt.show()
(B): Binomial distribution
import numpy as np
# Draw 10 values from a binomial distribution: each value is the number of
# successes observed in n_trials trials with success probability p_success.
n_trials, p_success = 20, 0.5
random_sample = np.random.binomial(n=n_trials, p=p_success, size=10)
print(random_sample)
2.Test of significance
a:T-Test: one sample, two independent samples and paired
(A): one sample test
import pandas as pd
from scipy.stats import ttest_1samp

# One-sample t-test: does the mean math score of male students differ from a
# hypothesised population mean?
df = pd.read_csv('StudentsPerformance.csv')
male_scores = df[df['gender'] == 'male']['math score']

# Value the sample mean is tested against; this is a placeholder, so replace
# it with the reference mean that is meaningful for your data.
hypothesized_mean = 65

t_statistic, p_value = ttest_1samp(male_scores, hypothesized_mean)
print("one sample")
print(male_scores)
print(f'T-Statistic: {t_statistic}')
print(f'P-Value: {p_value}')
[note: change the values according to your csv file]
(B):two independent samples
from scipy.stats import ttest_ind
import pandas as pd
# Independent two-sample t-test comparing the math scores of male and
# female students.
df = pd.read_csv('StudentsPerformance.csv')
math_by_gender = df.groupby('gender')['math score']
male_scores = math_by_gender.get_group('male')
female_scores = math_by_gender.get_group('female')

test_result = ttest_ind(male_scores, female_scores)
t_statistic = test_result.statistic
p_value = test_result.pvalue

print("two independent")
print(f'T-Statistic: {t_statistic}')
print(f'P-Value: {p_value}')
b:ANOVA & Chi-Square Test
(A):ANOVA
import pandas as pd
import scipy.stats as stats
# One-way ANOVA on the 'value' column across the groups labelled A, B and C.
data_anova = pd.read_csv('data_anova.csv')
groups = data_anova['group']
values = data_anova['value']

# One sample of values per group label, passed to f_oneway in A, B, C order.
samples = [values[groups == label] for label in ('A', 'B', 'C')]
f_statistic, p_value = stats.f_oneway(*samples)

print("annova test")
print("F-Statistic:", f_statistic)
print("P-Value:", p_value)
(B):Chi-Square Test
import pandas as pd
import scipy.stats as stats
# Chi-square test of independence between two categorical columns.
data_chi2 = pd.read_csv('data_chi2.csv')

# Contingency table of observed counts for every category1/category2 pair.
observed_values = pd.crosstab(
    index=data_chi2['category1'],
    columns=data_chi2['category2'],
)
chi2_stat, p_value, dof, expected = stats.chi2_contingency(observed_values)

print("chi-square")
for label, result in (("Chi-Square Statistic:", chi2_stat), ("P-Value:", p_value)):
    print(label, result)
[note: change the values according to your csv file]
3.Correlation and Regression analysis
a. Scatter diagram and calculation of the correlation coefficient
import numpy as np
import matplotlib.pyplot as plt
# Synthetic data: y is a linear function of x (slope 2, intercept 1) with a
# small amount of Gaussian noise added.
x_data = np.random.rand(50)
noise = 0.1 * np.random.randn(50)
y_data = 2 * x_data + 1 + noise

plt.scatter(x_data, y_data, color='blue', marker='o', label='Data Points')

# Pearson correlation: the off-diagonal entry of the 2x2 correlation matrix.
correlation_matrix = np.corrcoef(x_data, y_data)
correlation_coefficient = correlation_matrix[0, 1]
print(f"Correlation Coefficient: {correlation_coefficient:.4f}")

# Decorate and display the figure.
plt.title("Scatter Plot with Correlation Coefficient")
plt.xlabel("X Data")
plt.ylabel("Y Data")
plt.legend()
plt.grid(True)
plt.show()
b.Linear regression: fitting, testing model adequacy and prediction (simple and
multiple)
(A):simple
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
# Reproducible synthetic data: y = 4 + 3x plus standard normal noise.
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 4 + 3 * X + noise

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out points together with the fitted regression line.
plt.scatter(X_test, y_test, color='blue', label='Actual Data')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Simple Linear Regression')
plt.legend()
plt.show()

# Model adequacy: root mean squared error on the held-out rows.
mse = metrics.mean_squared_error(y_test, y_pred)
print('Root Mean Squared Error:', np.sqrt(mse))
(B): multiple
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
# Reproducible synthetic data with three features:
# y = 5 + 3*x1 + 2*x2 - 4*x3 plus standard normal noise.
np.random.seed(0)
X = np.random.rand(100, 3)
x1, x2, x3 = X[:, 0], X[:, 1], X[:, 2]
noise = np.random.randn(100)
y = 5 + 3 * x1 + 2 * x2 - 4 * x3 + noise

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model adequacy: root mean squared error on the held-out rows.
mse = metrics.mean_squared_error(y_test, y_pred)
print('Root Mean Squared Error:', np.sqrt(mse))

# Actual versus predicted targets, plotted against the first feature only.
first_feature = X_test[:, 0]
plt.scatter(first_feature, y_test, color='blue', label='Actual Data')
plt.scatter(first_feature, y_pred, color='red', label='Predicted Data')
plt.xlabel('Feature 1')
plt.ylabel('Target')
plt.title('Multiple Linear Regression')
plt.legend()
plt.show()
c.Fitting of logistic regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.exceptions import ConvergenceWarning
import warnings
# Fit a logistic regression that predicts passenger survival.
# NOTE(review): the column names below assume the Kaggle Titanic layout of
# train.csv - adjust them if your file uses different columns.
df = pd.read_csv('train.csv')

# Numeric predictors. Missing entries are replaced by the column median
# because LogisticRegression rejects NaN values.
feature_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = df[feature_columns].fillna(df[feature_columns].median())

# Binary target: whether the passenger survived. (Encoding a free-text column
# such as 'Name' would create one class per row, which cannot be learned, and
# the target itself must not appear among the features.)
y = LabelEncoder().fit_transform(df['Survived'])

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Silence convergence warnings only while fitting.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)
print(f'Accuracy: {accuracy:.2f}\nConfusion Matrix:\n{conf_matrix}\nClassification Report:\n{classification_rep}')
[Note : change the values according to the csv file]
[All the CSV files used in the code above are available here: https://mega.nz/folder/twwgABTS#E27XzWV-gKSxZkB7waIQbg]
Ohhh bhaaiiiii....👍 Helpfull....
ReplyDeleteyour welcome!!!
DeleteThis comment has been removed by a blog administrator.
ReplyDelete