BCA & BBA(CA) Data Visualization | Exam Ready Questions + Solutions 💡
Q1. Perform exploratory data analysis (EDA) on a dataset such as an email dataset.
# Q1 script: load an uploaded CSV and run a basic exploratory data analysis.
# Import pandas → used to read and analyze data
import pandas as pd
# Import matplotlib → used for basic graphs
import matplotlib.pyplot as plt
# Import seaborn → used for advanced visualization
import seaborn as sns
# Import files → used to upload file in Google Colab
from google.colab import files

# 📂 Upload CSV file
# why we use this → to open "Choose File" button and upload dataset
uploaded = files.upload()

# Get file name
# why we use this → the upload result is keyed by file name, so the first key
# is the name of the file that was just uploaded
file_name = list(uploaded.keys())[0]

# Read CSV file
# why we use this → to load CSV data into dataframe
df = pd.read_csv(file_name)

# Show first 5 rows
# why we use this → to quickly see sample data
print(df.head())

# Show dataset information
# why we use this → to check data types & null values
# NOTE: info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
df.info()

# Show statistical summary
# why we use this → to get mean, min, max, etc.
print(df.describe())

# Check missing values
# why we use this → to find missing data in each column
print(df.isnull().sum())

# Histogram
# why we use this → to see data distribution
df.hist()
plt.show()

# Correlation heatmap
# why we use this → to find relationship between columns
# numeric_only=True restricts the correlation to numeric columns.
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()

print("✅ EDA Completed")
CSV - DOWNLOAD
email_id,sender,subject,message,spam,label
1,abc@gmail.com,Win Money Now,"Congratulations! You have won $1000. Click here",Yes,1
2,friend@gmail.com,Meeting Today,"Hey, let's meet at 5pm today",No,0
3,offer@shopping.com,Big Sale,"Flat 70% discount on all items. Hurry up!",Yes,1
4,boss@company.com,Project Update,"Please send the project report by evening",No,0
5,lottery@win.com,Lottery Winner,"You are selected for lottery prize. Claim now",Yes,1
6,colleague@office.com,Lunch Plan,"Shall we go for lunch together?",No,0
7,bank@secure.com,Account Alert,"Your account is blocked. Verify immediately",Yes,1
8,team@company.com,Team Meeting,"Weekly team meeting at 10 AM",No,0
9,deals@amazon.com,Exclusive Offer,"Buy 1 Get 1 Free on electronics",Yes,1
10,hr@company.com,Interview Schedule,"Your interview is scheduled tomorrow",No,0
Q2. Write a program to perform data analysis and representation on a map using various map datasets, with
mouse rollover effects, user interaction, etc.
# Install folium
!pip install folium
# Import libraries
import pandas as pd
import folium
from google.colab import files
# Upload CSV
uploaded = files.upload()
# Get file name
file_name = list(uploaded.keys())[0]
# Read CSV
df = pd.read_csv(file_name)
# Create map
m = folium.Map(location=[20.5937, 78.9629], zoom_start=5)
# Add markers
for i in range(len(df)):
folium.Marker(
location=[df.loc[i, 'lat'], df.loc[i, 'lon']],
popup=f"{df.loc[i, 'city']} - {df.loc[i, 'value']}",
tooltip=df.loc[i, 'city']
).add_to(m)
# Save map
m.save("map.html")
# ✅ IMPORTANT: Download file
files.download("map.html")
print("✅ Map downloaded successfully!")
CSV - DOWNLOAD
city,lat,lon,value
Pune,18.5204,73.8567,100
Mumbai,19.0760,72.8777,200
Delhi,28.7041,77.1025,150
Bangalore,12.9716,77.5946,180
Hyderabad,17.3850,78.4867,130
Chennai,13.0827,80.2707,170
Q3. Build cartographic visualization for multiple datasets involving various countries of the world.
# Install folium
!pip install folium
# Import libraries
import pandas as pd # used to read CSV data
import folium # used to create maps
from google.colab import files # used to upload/download files
# Upload CSV file
uploaded = files.upload()
# why we use this → to upload dataset
# Get file name
file_name = list(uploaded.keys())[0]
# Read CSV
df = pd.read_csv(file_name)
# why we use this → to load dataset into dataframe
# Show data
print(df.head())
# Create world map
m = folium.Map(location=[20, 0], zoom_start=2)
# why we use this → to show world-level map
# Add markers for each country
for i in range(len(df)):
folium.CircleMarker(
location=[df.loc[i, 'lat'], df.loc[i, 'lon']],
radius=df.loc[i, 'value'] / 50,
# why we use this → size represents data value
popup=f"{df.loc[i, 'country']} - {df.loc[i, 'value']}",
# why we use this → show details on click
tooltip=df.loc[i, 'country'],
# why we use this → show country name on hover
color='blue',
fill=True
).add_to(m)
# Save map
m.save("world_map.html")
# why we use this → save map as HTML
# Download map
files.download("world_map.html")
# why we use this → download map file
print("✅ World Map Created Successfully!")
CSV - DOWNLOAD
country,lat,lon,value
India,20.5937,78.9629,1400
USA,37.0902,-95.7129,331
UK,55.3781,-3.4360,67
Germany,51.1657,10.4515,83
Brazil,-14.2350,-51.9253,213
Japan,36.2048,138.2529,125
Australia,-25.2744,133.7751,26
Canada,56.1304,-106.3468,38
France,46.2276,2.2137,65
China,35.8617,104.1954,1440
Q4. Create basic visualizations using Python.
Instructions:
Import dataset.
Create the following plots using Matplotlib:
Line chart
Bar chart
Pie chart
Histogram
# Q4 script: line, bar, pie and histogram charts from an uploaded CSV.
import pandas as pd  # reads the CSV file
import matplotlib.pyplot as plt  # draws the charts
from google.colab import files  # Colab upload helper

# Ask the user for the dataset; the result is keyed by file name.
uploaded = files.upload()
file_name = list(uploaded)[0]

# Load the dataset into a dataframe and preview it.
df = pd.read_csv(file_name)
print(df.head())

# Pull out the columns once so every chart below reads from the same series.
months = df['month']
sales = df['sales']
profit = df['profit']

# -------------------------------
# 📈 Line chart → sales trend across months
# -------------------------------
plt.plot(months, sales)
plt.xlabel("Month")
plt.ylabel("Sales")
plt.title("Sales Trend")
plt.show()

# -------------------------------
# 📊 Bar chart → compare profit month by month
# -------------------------------
plt.bar(months, profit)
plt.xlabel("Month")
plt.ylabel("Profit")
plt.title("Monthly Profit")
plt.show()

# -------------------------------
# 🥧 Pie chart → each month's share of sales, labelled with a percentage
# -------------------------------
plt.pie(sales, labels=months, autopct='%1.1f%%')
plt.title("Sales Distribution")
plt.show()

# -------------------------------
# 📉 Histogram → how the sales values are distributed
# -------------------------------
plt.hist(sales)
plt.xlabel("Sales")
plt.ylabel("Frequency")
plt.title("Sales Histogram")
plt.show()

print("✅ All Visualizations Created Successfully!")
CSV - DOWNLOAD
month,sales,profit
Jan,100,20
Feb,150,30
Mar,200,50
Apr,180,40
May,220,60
Jun,250,70
Q5. Perform Exploratory Data Analysis on the
Wine Quality Dataset.
# Q5 script: exploratory data analysis of an uploaded wine-quality CSV.
# Import libraries
import pandas as pd  # used to read dataset
import matplotlib.pyplot as plt  # used for graphs
import seaborn as sns  # used for advanced graphs
from google.colab import files  # used to upload file

# Upload CSV file
# why we use this → to upload dataset
uploaded = files.upload()

# Get file name (the upload result is keyed by file name)
file_name = list(uploaded.keys())[0]

# Load dataset
# why we use this → to read CSV into dataframe
df = pd.read_csv(file_name)

# Show first rows
# why we use this → to understand data
print(df.head())

# Dataset info
# why we use this → to check data types & null values
# NOTE: info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
df.info()

# Statistical summary
# why we use this → to get mean, min, max etc.
print(df.describe())

# ===============================
# 🔍 Analyze key attributes
# ===============================
# Acidity distribution
# why we use this → to see acidity distribution
plt.hist(df['fixed_acidity'])
plt.title("Fixed Acidity Distribution")
plt.xlabel("Acidity")
plt.ylabel("Frequency")
plt.show()

# Sugar vs Alcohol (scatter plot)
# why we use this → to find relationship
plt.scatter(df['residual_sugar'], df['alcohol'])
plt.title("Sugar vs Alcohol")
plt.xlabel("Sugar")
plt.ylabel("Alcohol")
plt.show()

# Alcohol vs Quality
# why we use this → compare alcohol with quality
# Plotting the raw columns would draw one bar per row, and rows that share a
# quality score would be stacked on top of each other at the same x position.
# Average the alcohol per quality score first so each score gets exactly one bar.
alcohol_by_quality = df.groupby('quality')['alcohol'].mean()
plt.bar(alcohol_by_quality.index, alcohol_by_quality.values)
plt.title("Alcohol vs Quality")
plt.xlabel("Quality")
plt.ylabel("Alcohol")
plt.show()

# ===============================
# 🔗 Correlation Analysis
# ===============================
# why we use this → to find relationship between variables
corr = df.corr(numeric_only=True)
print(corr)

# Heatmap
# why we use this → visual representation of correlation
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

print("✅ EDA Completed Successfully!")
CSV - DOWNLOAD
fixed_acidity,volatile_acidity,citric_acid,residual_sugar,alcohol,quality
7.4,0.70,0.00,1.9,9.4,5
7.8,0.88,0.00,2.6,9.8,5
7.8,0.76,0.04,2.3,9.8,5
11.2,0.28,0.56,1.9,9.8,6
7.4,0.70,0.00,1.9,9.4,5
7.9,0.60,0.06,1.6,9.4,5
7.3,0.65,0.00,1.2,10.0,6
7.8,0.58,0.02,2.0,9.5,5
7.5,0.50,0.36,6.1,10.5,6
6.7,0.58,0.08,1.8,9.2,5
Q6. Load a dataset in Python and display:
First 5 rows
Last 5 rows
Dataset structure
Use Pandas.
# Q6 script: show the first rows, last rows and structure of an uploaded CSV.
# Import pandas
# why we use this → to work with dataset (CSV file)
import pandas as pd
# why we use this → to upload file in Colab
from google.colab import files

# Upload CSV file
# why we use this → to select and upload dataset
uploaded = files.upload()

# Get file name (the upload result is keyed by file name)
file_name = list(uploaded.keys())[0]

# Load dataset
# why we use this → to read CSV file into dataframe
df = pd.read_csv(file_name)

# ===============================
# 📊 Display First 5 Rows
# ===============================
# why we use this → to see starting data
print("First 5 Rows:")
print(df.head())

# ===============================
# 📊 Display Last 5 Rows
# ===============================
# why we use this → to see ending data
print("\nLast 5 Rows:")
print(df.tail())

# ===============================
# 📊 Dataset Structure
# ===============================
# why we use this → to check columns, data types, null values
# NOTE: info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
print("\nDataset Structure:")
df.info()
id,name,age,marks
1,Amit,18,85
2,Riya,19,90
3,Rahul,18,78
4,Pooja,20,88
5,Neha,19,92
6,Arjun,18,75
7,Sneha,20,89
8,Karan,19,80
Q7. Convert a dataset into a Pandas DataFrame and perform:
Sorting
Filtering
Display summary statistics.
# Q7 script: sort, filter and summarise an uploaded CSV with pandas.
import pandas as pd  # dataframe handling
from google.colab import files  # Colab upload helper

# Let the user pick the dataset; the result is keyed by file name.
uploaded = files.upload()
file_name = list(uploaded)[0]

# Turn the CSV into a DataFrame.
df = pd.read_csv(file_name)

# -------------------------------
# 🔽 Sorting → rows ordered by salary, lowest first
# -------------------------------
sorted_df = df.sort_values('salary')
print("Sorted Data (by salary):")
print(sorted_df)

# -------------------------------
# 🔍 Filtering → keep only rows whose salary exceeds 35000
# -------------------------------
high_salary_mask = df['salary'] > 35000
filtered_df = df[high_salary_mask]
print("\nFiltered Data (salary > 35000):")
print(filtered_df)

# -------------------------------
# 📊 Summary statistics → count, mean, min, max, etc.
# -------------------------------
summary = df.describe()
print("\nSummary Statistics:")
print(summary)
CSV - DOWNLOAD
id,name,age,salary,department
1,Amit,25,30000,IT
2,Riya,30,40000,HR
3,Rahul,28,35000,Finance
4,Pooja,32,45000,IT
5,Neha,27,38000,HR
6,Arjun,29,42000,Finance
7,Sneha,31,46000,IT
8,Karan,26,32000,HR
Q8. Create the following plots using Python:
Histogram
Pie chart
# Q8 script: histogram and pie chart of marks from an uploaded CSV.
import pandas as pd  # reads the dataset
import matplotlib.pyplot as plt  # draws the plots
from google.colab import files  # Colab upload helper

# Ask the user for the dataset; the result is keyed by file name.
uploaded = files.upload()
file_name = list(uploaded)[0]

# Load the CSV into a dataframe and preview it.
df = pd.read_csv(file_name)
print(df.head())

# Both plots read the same column, so fetch it once.
marks = df['marks']

# -------------------------------
# 📉 Histogram → how the marks are distributed
# -------------------------------
plt.hist(marks)
plt.xlabel("Marks")
plt.ylabel("Frequency")
plt.title("Marks Distribution")
plt.show()

# -------------------------------
# 🥧 Pie chart → each name's share of the marks, labelled with a percentage
# -------------------------------
plt.pie(marks, labels=df['name'], autopct='%1.1f%%')
plt.title("Marks Percentage")
plt.show()

print("✅ Plots Created Successfully!")
CSV - DOWNLOAD
name,marks
Amit,85
Riya,90
Rahul,78
Pooja,88
Neha,92
Arjun,75
Sneha,89
Karan,80
Q9. Perform EDA on the Wine Quality Dataset.
Steps:
Load dataset
Display summary
Plot graphs
# Q9 script: load a wine-quality CSV, print its summary and plot graphs.
# Import libraries
import pandas as pd  # used to load dataset
import matplotlib.pyplot as plt  # used for graphs
from google.colab import files  # used to upload file

# Upload CSV file
# why we use this → to upload dataset
uploaded = files.upload()

# Get file name (the upload result is keyed by file name)
file_name = list(uploaded.keys())[0]

# Load dataset
# why we use this → to convert CSV into dataframe
df = pd.read_csv(file_name)

# ===============================
# 📊 Display Summary
# ===============================
# why we use this → to view starting data
print("First 5 Rows:")
print(df.head())

# why we use this → to check structure
# NOTE: info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
print("\nDataset Info:")
df.info()

# why we use this → to get mean, min, max
print("\nSummary Statistics:")
print(df.describe())

# ===============================
# 📈 Plot Graphs
# ===============================
# Histogram (Alcohol)
# why we use this → to see distribution
plt.hist(df['alcohol'])
plt.title("Alcohol Distribution")
plt.xlabel("Alcohol")
plt.ylabel("Frequency")
plt.show()

# Quality scores repeat and appear in file order, so plotting the raw columns
# makes the line jump back and forth and stacks the bars on top of each other.
# Average the alcohol per quality score (groupby sorts by quality) so both
# charts below get exactly one point/bar per score, in increasing order.
alcohol_by_quality = df.groupby('quality')['alcohol'].mean()

# Line Plot (Quality vs Alcohol)
# why we use this → to see trend
plt.plot(alcohol_by_quality.index, alcohol_by_quality.values, marker='o')
plt.title("Quality vs Alcohol")
plt.xlabel("Quality")
plt.ylabel("Alcohol")
plt.show()

# Bar Chart (Quality)
# why we use this → to compare values
plt.bar(alcohol_by_quality.index, alcohol_by_quality.values)
plt.title("Alcohol by Quality")
plt.xlabel("Quality")
plt.ylabel("Alcohol")
plt.show()

print("✅ EDA Completed Successfully!")
CSV - DOWNLOAD
fixed_acidity,volatile_acidity,citric_acid,residual_sugar,alcohol,quality
7.4,0.70,0.00,1.9,9.4,5
7.8,0.88,0.00,2.6,9.8,5
7.8,0.76,0.04,2.3,9.8,5
11.2,0.28,0.56,1.9,9.8,6
7.4,0.70,0.00,1.9,9.4,5
7.9,0.60,0.06,1.6,9.4,5
7.3,0.65,0.00,1.2,10.0,6
7.8,0.58,0.02,2.0,9.5,5
7.5,0.50,0.36,6.1,10.5,6
6.7,0.58,0.08,1.8,9.2,5
Q10. Create the following graphs using Python:
Line graph
Scatter plot
# Q10 script: line graph and scatter plot from an uploaded CSV.
import pandas as pd  # reads the dataset
import matplotlib.pyplot as plt  # draws the graphs
from google.colab import files  # Colab upload helper

# Ask the user for the dataset; the result is keyed by file name.
uploaded = files.upload()
file_name = list(uploaded)[0]

# Load the CSV into a dataframe and preview it.
df = pd.read_csv(file_name)
print(df.head())

# Sales is the y-axis of both graphs, so fetch it once.
sales = df['sales']

# -------------------------------
# 📈 Line graph → sales trend across months
# -------------------------------
plt.plot(df['month'], sales)
plt.xlabel("Month")
plt.ylabel("Sales")
plt.title("Sales Trend")
plt.show()

# -------------------------------
# 🔵 Scatter plot → relationship between advertising and sales
# -------------------------------
plt.scatter(df['advertising'], sales)
plt.xlabel("Advertising")
plt.ylabel("Sales")
plt.title("Advertising vs Sales")
plt.show()

print("✅ Graphs Created Successfully!")
month,sales,advertising
Jan,100,20
Feb,150,25
Mar,200,30
Apr,180,28
May,220,35
Jun,250,40
Q11. Create a dataset and perform operations using NumPy.
Operations:
Mean
Median
Standard deviation
# Q11 script: mean, median and standard deviation of a CSV column with NumPy.
import numpy as np  # numerical operations
import pandas as pd  # reads the CSV
from google.colab import files  # Colab upload helper

# Ask the user for the dataset; the result is keyed by file name.
uploaded = files.upload()
file_name = list(uploaded)[0]

# Read the CSV with pandas, then take the 'values' column as a NumPy array.
df = pd.read_csv(file_name)
data = df['values'].values

# -------------------------------
# 📊 Compute the three statistics
# -------------------------------
mean_value = np.mean(data)  # average
median_value = np.median(data)  # middle value
std_value = np.std(data)  # spread of the data

# -------------------------------
# 📊 Report them
# -------------------------------
print("Mean:", mean_value)
print("Median:", median_value)
print("Standard Deviation:", std_value)
values
10
20
30
40
50
60
70
80
90
100
0 Comments