Useful Functions in Python
1. Overview
2. Basics
3. Mathematics Functions
4. File Input and Output
5. Data Structure
6. Image Processing
7. Data Science
8. Deep Learning Function
9. Exception Handling
10. Bioinformatics
1. Overview
⑴ The functions defined below can be called as follows, by fetching the .py file from GitHub into Python.
### Reference : https://changhsinlee.com/colab-import-python/
import requests
# Save datagenerators as file to colab working directory
# If you are using GitHub, make sure you get the "Raw" version of the code
url = 'https://github.com/JB243/nate9389/blob/main/Python/A_Collection_of_Useful_Functions_in_Python.py?raw=true'
r = requests.get(url)
# make sure your filename is the same as how you want to import
with open('A_Collection_of_Useful_Functions_in_Python.py', 'w') as f:
f.write(r.text)
# now we can import
from A_Collection_of_Useful_Functions_in_Python import SUM
### Warning
# Only SUM is implemented in this example .py file
SUM(2,6)
⑵ The functions defined below can be used as follows. Python uses the def keyword to define a function.
def SUM(a, b):
return a+b
SUM(1,2)
# 3
2. Basics
⑴ Code that you must remember
① Type check:
type(x)
② Length:
len(x)
③ Dimension (e.g., numpy.ndarray, cv2.imread):
x.shape
④ Dimension (e.g., PIL.PngImagePlugin.PngImageFile, PIL.Image.open):
x.size
⑤ Casting into int:
.astype(int)
⑥ A set of unique elements in a given array:
.unique()
⑦ Wait for x seconds:
time.sleep(x)
⑧ An easy way to create a for loop with an index when you have a list:
for index, e in enumerate(my_list): ...
⑨ Split the given string by '_' and take only the front part (a combined demo of ①–⑨ follows):
myStr.split('_')[0]
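A minimal combined sketch of these basics (my_list, arr, and the strings are hypothetical example values):
import time
import numpy as np
my_list = ['alpha_1', 'beta_2']
print(type(my_list))             # <class 'list'>
print(len(my_list))              # 2
arr = np.array([[1.5, 2.5], [3.5, 4.5]])
print(arr.shape)                 # (2, 2)
print(arr.astype(int))           # [[1 2] [3 4]]
print(np.unique([1, 1, 2]))      # [1 2]
time.sleep(0.1)                  # wait 0.1 seconds
for index, e in enumerate(my_list):
    print(index, e.split('_')[0])   # 0 alpha / 1 beta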
⑵ Zero-padding a natural number so that it is five digits in total
n = 55
idx = str(n).zfill(5)
⑶ All lowercase letters or all uppercase letters for a given string
_str = "ABCdef"
_str.lower()
# 'abcdef'
_str.upper()
# 'ABCDEF'
⑷ Whether a given string starts with a specific substring
a = "ABCD"
a.startswith('AB')
# True
''' if a is a pandas Series rather than a plain string
a.str.startswith('AB')
# returns a boolean Series
'''
⑸ Whether a given string ends with a specific string
a = "ABCD"
a.endswith('CD')
# True
⑹ Whether the given string (give_str) contains a specific string (partial_str)
def is_contain(given_str, partial_str):
s = ''
for char in given_str:
s = s + char
if s.endswith(partial_str):
return True
return False
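In everyday code, Python's in operator performs the same containment check directly:
print('BC' in 'ABCD')   # True
print('CA' in 'ABCD')   # False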
⑺ Function that removes the first few characters from a given string (str) and returns them
def str_omit_forward(_str, n):
return _str[n:]
⑻ A function that removes the last few characters from a given string (str) and returns them
def str_omit_backward(_str, n):
return _str[:-n]
⑼ Whether a given element (e) is included in the list (l)
def is_element_in_list(e, l):
for i in range(len(l)):
if e == l[i]:
return True
return False
⑽ Function that returns the index of element (e) in list (l), or -1 if it is absent
def where_element_in_list(e, l):
for i in range(len(l)):
if e == l[i]:
return i
return -1
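The built-in list.index gives the same position but raises ValueError when the element is absent, so check membership first:
l = ['a', 'b', 'c']
print(l.index('b'))   # 1
print('z' in l)       # False; guard with this before calling l.index('z')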
⑾ Function that outputs the intersection of two lists
def is_element_in_list(e, l):
for i in range(len(l)):
if e == l[i]:
return True
return False
def my_intersection(l1, l2):
    l = []
    for i in range(len(l1)):
        if is_element_in_list(l1[i], l2):
            l.append(l1[i])  # append; indexing into the empty list would raise IndexError
    return l
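If element order does not matter and duplicates may collapse, Python sets give the same result in one line:
print(list(set([1, 2, 3]) & set([2, 3, 4])))   # [2, 3] (order not guaranteed)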
⑿ Date notation conversion function: it converts ‘Mar 02, 2018’ to 20180302, as an example.
def convert_date(_str):
def month_to_num(_str):
if _str == 'Jan':
return '01'
if _str == 'Feb':
return '02'
if _str == 'Mar':
return '03'
if _str == 'Apr':
return '04'
if _str == 'May':
return '05'
if _str == 'Jun':
return '06'
if _str == 'Jul':
return '07'
if _str == 'Aug':
return '08'
if _str == 'Sep':
return '09'
if _str == 'Oct':
return '10'
if _str == 'Nov':
return '11'
if _str == 'Dec':
return '12'
flag = 0
s1 = '' # Mar
s2 = '' # 01
s3 = '' # 2018
for char in _str:
## closing
if char == ' ' and flag == 0:
flag = 1
elif char == ',' and flag == 1:
flag = 2
elif char == ' ' and flag == 2:
flag = 3
## propagation
elif flag == 0:
s1 = s1 + char
elif flag == 1:
s2 = s2 + char
elif flag == 3:
s3 = s3 + char
return str(s3) + str(month_to_num(s1)) + str(s2)
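The same conversion is more robust with the standard library's datetime; a one-line sketch:
from datetime import datetime
print(datetime.strptime('Mar 02, 2018', '%b %d, %Y').strftime('%Y%m%d'))   # 20180302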
⒀ Creating a data frame with two lists
import pandas as pd
v1 = []
v2 = []
...
df = pd.DataFrame(
{'Column1': v1,
'Column2': v2
})
⒁ Getting the source code of a function
import inspect
lines = inspect.getsource(MyFunc)
print(lines)
⒂ Receive variable names as strings and dynamically utilize them
def set_variable_from_string(variable_name, value):
globals()[variable_name] = value
def get_variable_value_from_string(variable_name):
return globals().get(variable_name, None)
# Set variable name to string
var_name = "sample_variable"
# Use string to create variables
set_variable_from_string(var_name, 12345)
# Get value of variable using string
value = get_variable_value_from_string(var_name)
print(f"{var_name} = {value}")
3. Mathematics Functions
⑴ Code to obtain the greatest common divisor
def gcd(x: int, y: int) -> int:
while y:
x, y = y, x % y
return x
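For reference, math.gcd in the standard library gives the same result, and the least common multiple follows from it:
import math
print(math.gcd(12, 18))              # 6
print(12 * 18 // math.gcd(12, 18))   # lcm = 36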
4. File Input and Output
⑴ csv file read
import pandas as pd
x = pd.read_csv("FILE.csv", header=None, names= ['barcodes','tissue','row','col','imgrow','imgcol'])
⑵ Text file read
import csv
def read_txt(txt_dir):
    l = []
    with open(txt_dir, 'r', encoding='utf-8') as f:
        for line in csv.reader(f):
            ### if you want elements of list ###
            l.append(line)
            ### if you want elements of str ###
            # l.append(''.join(line))
    return l
⑶ Switch to a data frame and write to a .csv: Use iloc to get the exact rows and columns from pandas.DataFrame
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
pandas.DataFrame(MY_VARIABLE).to_csv(FILE_PATH)
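A small sketch of the iloc selection mentioned above (MY_VARIABLE and FILE_PATH are placeholders as in the line above), keeping the first ten rows and first two columns before writing:
import pandas as pd
df = pd.DataFrame(MY_VARIABLE)
df.iloc[0:10, 0:2].to_csv(FILE_PATH)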
⑷ Read all .csv files in the directory
import glob
print(glob.glob("./dir/*.csv"))
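To go one step further and load every matched file into a single DataFrame (a sketch assuming the files share columns):
import glob
import pandas as pd
df_all = pd.concat((pd.read_csv(f) for f in glob.glob("./dir/*.csv")), ignore_index=True)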
⑸ Creating, reading, and grouping HDF (.h5) files
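No snippet accompanied this item; a minimal h5py sketch covering creation, grouping, and reading (the file name demo.h5 is an assumption):
import h5py
import numpy as np
# Create a file with one group holding one dataset
with h5py.File('demo.h5', 'w') as f:
    grp = f.create_group('group1')
    grp.create_dataset('data', data=np.arange(10))
# Read the file back, list its groups, and slice the dataset
with h5py.File('demo.h5', 'r') as f:
    print(list(f.keys()))        # ['group1']
    print(f['group1/data'][:])   # [0 1 2 3 4 5 6 7 8 9]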
⑹ Website crawling (Reference)
import io
import requests
import pandas as pd
url = 'https://typeset.io/search?q=Microbiome'
github_session = requests.Session()
download = github_session.get(url).content
print(download)
⑺ Merging PDF files (ref)
from PyPDF2 import PdfWriter
merger = PdfWriter()
for pdf in ['pdf_1.pdf', 'pdf_2.pdf', 'pdf_3.pdf']:
merger.append(pdf)
merger.write("merged.pdf")  # output file name (the original referenced an undefined 'target' variable)
merger.close()
⑻ HTML to MarkDown
!pip install html2text
import html2text
def html_to_markdown(html_content):
# Create an html2text converter object
h = html2text.HTML2Text()
h.ignore_links = False
# Convert the HTML to Markdown
markdown_content = h.handle(html_content)
return markdown_content
# Example Usage
html_code = """
<h1>Hello World</h1>
<p>This is a paragraph. <a href="https://example.com">Here is a link</a>.</p>
"""
markdown_code = html_to_markdown(html_code)
print(markdown_code)
⑼ MarkDown to HTML
import markdown
def md_to_html(md_file_path, html_file_path):
"""
Convert a Markdown file to an HTML file.
:param md_file_path: Path to the input Markdown file.
:param html_file_path: Path to the output HTML file.
"""
# Read the Markdown file
with open(md_file_path, 'r', encoding='utf-8') as file:
md_content = file.read()
# Convert Markdown to HTML
html_content = markdown.markdown(md_content)
# Write the HTML content to a file
with open(html_file_path, 'w', encoding='utf-8') as file:
file.write(html_content)
# Example usage
md_to_html('example.md', 'example.html')
⑽ Merge all PDF files in the given folder (in alphabetical order)
import os
import glob
import fitz # PyMuPDF library
def merge_pdfs(input_folder, output_path):
# input_folder: Directory containing PDF files
# output_path: Path for the output PDF file
# Retrieve list of PDF files from the input folder
pdf_files = glob.glob(os.path.join(input_folder, '*.pdf'))
print(pdf_files)
# Sort in alphabetical order
pdf_files.sort()
# Create an empty PyMuPDF document to collect the merged pages
pdf_writer = fitz.open()
# Merge all PDF files
for pdf_file in pdf_files:
        pdf_document = fitz.open(pdf_file)
        pdf_writer.insert_pdf(pdf_document)  # append every page of this file
        pdf_document.close()
# Save the output PDF file
pdf_writer.save(output_path)
pdf_writer.close()
# Print completion message for file merging
print(f'PDF files have been merged into {output_path}.')
# Set the input folder and output file path
input_folder = r'D:/1'
output_path = r'D:/output.pdf'
# Execute PDF file merging
merge_pdfs(input_folder, output_path)
⑾ Collect all download links from a specific HTML document and then download the files
import requests
from bs4 import BeautifulSoup
import os
def download_file(url, local_filename):
"""Function to download a file from a given URL."""
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
return local_filename
# Read the modified HTML file
with open('modified_html_file.html', 'r') as file:
html_content = file.read()
# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')
# Find all 'a' tags with href attributes
links = soup.find_all('a', href=True)
# Folder to save files (created automatically if it does not exist)
save_folder = 'out'
os.makedirs(save_folder, exist_ok=True)
# Download each file
for link in links:
url = link['href']
filename = os.path.join(save_folder, url.split('/')[-1])
print(f"Downloading {url}...")
download_file(url, filename)
print("Download completed.")
⑿ Code to split a GIF file into multiple PNG files.
from PIL import Image
import os
def gif_to_png(gif_filename, output_folder):
# Ensure the output folder exists
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Open the GIF file
with Image.open(gif_filename) as img:
# Loop over each frame in the GIF
for frame in range(img.n_frames):
# Select the frame
img.seek(frame)
# Save the frame as a PNG file
frame_filename = f"{output_folder}/frame_{frame}.png"
img.save(frame_filename)
print(f"Saved {frame_filename}")
# Example usage
gif_filename = 'img.gif' # Replace with your GIF file path
output_folder = 'out' # Folder where PNG files will be saved
gif_to_png(gif_filename, output_folder)
⒀ Code to output the directory tree structure
import os
def print_tree(directory, prefix=''):
"""Recursively prints a tree diagram of the directory structure."""
files = sorted(os.listdir(directory))
for i, file in enumerate(files):
path = os.path.join(directory, file)
is_last = i == (len(files) - 1)
print(f"{prefix}{'└── ' if is_last else '├── '}{file}")
if os.path.isdir(path):
# Add custom annotations here if file/directory matches certain criteria
new_prefix = prefix + (' ' if is_last else '│ ')
print_tree(path, new_prefix)
# Replace '/path/to/directory' with the actual directory path you want to visualize
print_tree('/path/to/directory')
⒁ Split a PDF file into separate PDF files for each page
import PyPDF2
def split_pdf_pages(input_pdf_path, output_folder):
# Open the PDF file
with open(input_pdf_path, "rb") as file:
reader = PyPDF2.PdfReader(file)
num_pages = len(reader.pages)
# Save each page as an independent PDF file
for i in range(num_pages):
writer = PyPDF2.PdfWriter()
writer.add_page(reader.pages[i])
output_filename = f"{output_folder}/page_{i+1}.pdf"
with open(output_filename, "wb") as output_file:
writer.write(output_file)
print(f"Saved: {output_filename}")
# Example usage of the function
split_pdf_pages("path_to_your_input.pdf", "output_directory")
5. Data Structure
⑴ Overview
① Array
② Linked list
③ Stack
④ Queue
⑤ Tree
⑥ Graph
⑦ Hash
⑧ Set
⑵ Counting Islands Algorithm: a function that counts the number of islands in world, an M × N list where 1 is land and 0 is water
def counting_island(world: list)->int:
class Node():
def __init__(self, i, j):
self.i = i
self.j = j
class undirected_graph():
def __init__(self, V:list, E:list)->None:
self.V = V[:]
self.neighbor = {}
for v in V:
self.neighbor[v] = []
for (v,w) in E:
self.neighbor[v].append(w)
def DFT_preorder(self)->int:
count = 0
if self.V:
visited = {}
for v in self.V:
visited[v]=False
for v in self.V:
if not visited[v]:
count += 1
self.__DFT__preorderHelp(visited, v)
return count
def __DFT__preorderHelp(self, visited: list, v: int)->None:
if not visited[v]:
visited[v] = True
for w in self.neighbor[v]:
self.__DFT__preorderHelp(visited, w)
V = []
E = []
for i in range(len(world)):
for j in range(len(world[0])):
if world[i][j] == 1:
V.append(Node(i, j))
for v in V:
for w in V:
if w.i == v.i and w.j == v.j - 1:
E.append((v,w))
if w.i == v.i and w.j == v.j + 1:
E.append((v,w))
if w.i == v.i - 1 and w.j == v.j:
E.append((v,w))
if w.i == v.i + 1 and w.j == v.j:
E.append((v,w))
g = undirected_graph(V, E)
return g.DFT_preorder()
world1 = [[1,1,1,1,0], [1,0,0,1,0], [1,1,0,1,0], [1,1,0,0,0]]
counting_island(world1) # 1
world2 = [[1,1,0,0,0], [1,1,0,0,0], [0,0,1,1,0], [0,0,0,0,1]]
counting_island(world2) # 3
world3 = [[0,1,0,1,0,1]]
counting_island(world3) # 3
6. Image Processing
⑴ Basic graph generation codes
① A function that draws a scatter plot given x, y
plt.scatter(x, y, alpha = 0.01)
plt.xlabel('brightness of img1')
plt.ylabel('brightness of img2')
② A function that draws a histogram of a given image
import numpy as np
import cv2
import matplotlib.pyplot as plt
img = cv2.imread('image.png')
fig, axes = plt.subplots(1, 1)
axes.hist(img.ravel(), bins=20)
axes.set_title('Histogram')
③ Code for drawing a 4-parameter logistic (4PL) curve, widely used for dose-response curves in pharmacology.
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
# 4PL Model Definition
def four_param_logistic(x, A, B, C, D):
return A + (B - A) / (1 + (x / C)**D)
# Example Data
x_data = np.linspace(1, 100, 100)
y_data = four_param_logistic(x_data, 0.5, 10, 50, 1.5) + np.random.normal(size=len(x_data), scale=0.5)
# Parameter estimation (curve fitting)
popt, pcov = curve_fit(four_param_logistic, x_data, y_data, p0=[1, 1, 1, 1])
# Model prediction based on the fitted parameters
x_fit = np.linspace(1, 100, 400)
y_fit = four_param_logistic(x_fit, *popt)
# Plotting Results
plt.figure(figsize=(10, 5))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_fit, y_fit, color='red', label='4PL fit')
plt.title('4-Parameter Logistic Fit')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()
⑵ Image Save
① Example of saving an image as a png (ref)
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(5, 5))
fig.patch.set_visible(False)
plt.imshow(demask_image_t[0], interpolation='nearest')
plt.axis('off')
plt.savefig('foo.png', bbox_inches='tight', dpi = 300)
② Example of saving an image as a pdf
...
plt.show()
fig.savefig('foo.pdf', bbox_inches='tight', dpi = 300)
⑶ cv2 image BGR-to-RGB channel rearrangement
import cv2
d = cv2.imread("/home/data/ST/WuSZ_2021_NG_TNBC/GSE/1160920F_spatial/spatial/tissue_hires_image.png")
d_array = cv2.cvtColor(d, cv2.COLOR_BGR2RGB)
plt.imshow(d_array)
⑷ cv2 image resize: For example, if img is (1024, 1024, 3), img_re is (512, 512, 3).
img_re = cv2.resize(img, dsize = (512, 512))
⑸ Conversion of an RGB image to a grayscale image
## version 1.
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
## version 2
import numpy as np
def RGBtoGray(img):
    # assumes img is in RGB channel order; convert with cv2.cvtColor(img, cv2.COLOR_BGR2RGB) if it came from cv2.imread
    out = np.empty((img.shape[0], img.shape[1]))
for i in range(img.shape[0]):
for j in range(img.shape[1]):
out[i:i+1, j:j+1] = ( 0.2989*img[i:i+1, j:j+1, 0:1] +
0.5870*img[i:i+1, j:j+1, 1:2] +
0.1140*img[i:i+1, j:j+1, 2:3]
).item()
return out
⑹ Extracting the red channel of a 3-channel RGB image as a two-dimensional image (W × H)
img_ = img[:,:,0:1].reshape(img.shape[0], img.shape[1])
⑺ A function that replaces one value with another in an image
def recolor(img, pre, post):
for i in range(img.shape[0]):
for j in range(img.shape[1]):
if img[i:i+1, j:j+1] == pre:
img[i:i+1, j:j+1] = post
return(img)
⑻ Change numpy.ndarray of integers to numpy.ndarray of real numbers (reference)
import cv2
import numpy as np
import matplotlib.pyplot as plt
img = cv2.imread('img.png')
img_cvt = np.array(img[:,:,:], dtype=np.float64)
⑼ Correlation between red and green in one image
import cv2
import matplotlib.pyplot as plt
from skimage.util import view_as_windows
import numpy as np
import scipy.stats
def two_image_correlation_RG(img1_dir):
img1 = cv2.imread(img1_dir) # RGB image
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
l_img1 = []
l_img2 = []
for i in range(img1.shape[0]):
for j in range(img1.shape[1]):
l_img1.append((
img1[i:i+1, j:j+1, 0:1]
).item())
l_img2.append((
img1[i:i+1, j:j+1, 1:2]
).item())
print("brightness of img1")
print(np.mean(l_img1))
print("brightness of img2")
print(np.mean(l_img2))
print("img1-img2 correlation")
print(scipy.stats.pearsonr(l_img1, l_img2) )
plt.scatter(l_img1, l_img2, alpha = 0.01)
plt.xlabel('brightness of img1')
plt.ylabel('brightness of img2')
⑽ Correlation between two images
import cv2
import matplotlib.pyplot as plt
from skimage.util import view_as_windows
import numpy as np
import scipy.stats
def two_image_correlation(img1_dir, img2_dir):
# img1 and img2 should be same in size
img1 = cv2.imread(img1_dir) # RGB image
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
img2 = cv2.imread(img2_dir) # RGB image
img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)
l_img1 = []
l_img2 = []
for i in range(img1.shape[0]):
for j in range(img1.shape[1]):
l_img1.append((
0.2989*img1[i:i+1, j:j+1, 0:1] +
0.5870*img1[i:i+1, j:j+1, 1:2] +
0.1140*img1[i:i+1, j:j+1, 2:3]
).item())
l_img2.append((
0.2989*img2[i:i+1, j:j+1, 0:1] +
0.5870*img2[i:i+1, j:j+1, 1:2] +
0.1140*img2[i:i+1, j:j+1, 2:3]
).item())
print("brightness of img1")
print(np.mean(l_img1))
print("brightness of img2")
print(np.mean(l_img2))
print("img1-img2 correlation")
print(scipy.stats.pearsonr(l_img1, l_img2) )
plt.scatter(l_img1, l_img2, alpha = 0.05)
plt.xlabel('brightness of img1')
plt.ylabel('brightness of img2')
# sns.regplot(l_b_CD31, l_b_Lipo, alpha = 0.05)
⑾ Correlation between red and green in one 3D image
import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage.util import view_as_windows
import scipy.stats
import glob
def two_image_correlation_RG_3D(my_dir):
images = glob.glob(my_dir + "*.jpg")
l_img1 = []
l_img2 = []
for k in range(len(images)):
print(k)
img1 = cv2.imread(images[k])
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
for i in range(img1.shape[0]):
for j in range(img1.shape[1]):
if (img1[i:i+1, j:j+1, 0:1] + img1[i:i+1, j:j+1, 1:2] + img1[i:i+1, j:j+1, 2:3]) != 0:
l_img1.append((
img1[i:i+1, j:j+1, 0:1]
).item())
l_img2.append((
img1[i:i+1, j:j+1, 1:2]
).item())
print("brightness of img1")
print(np.mean(l_img1))
print("brightness of img2")
print(np.mean(l_img2))
print("img1-img2 correlation")
print(scipy.stats.pearsonr(l_img1, l_img2) )
plt.scatter(l_img1, l_img2, alpha = 0.01)
plt.xlabel('brightness of img1')
plt.ylabel('brightness of img2')
⑿ Code for obtaining an SSIM value for a pair of images
def SSIM(x, y):
# assumption : x and y are grayscale images with the same dimension
import numpy as np
def mean(img):
return np.mean(img)
def sigma(img):
return np.std(img)
def cov(img1, img2):
img1_ = np.array(img1[:,:], dtype=np.float64)
img2_ = np.array(img2[:,:], dtype=np.float64)
return np.mean(img1_ * img2_) - mean(img1) * mean(img2)
K1 = 0.01
K2 = 0.03
    L = 255 # dynamic range when each pixel spans 0 to 255
C1 = K1 * K1 * L * L
C2 = K2 * K2 * L * L
C3 = C2 / 2
l = (2 * mean(x) * mean(y) + C1) / (mean(x)**2 + mean(y)**2 + C1)
c = (2 * sigma(x) * sigma(y) + C2) / (sigma(x)**2 + sigma(y)**2 + C2)
s = (cov(x, y) + C3) / (sigma(x) * sigma(y) + C3)
return l * c * s
⒀ Code for obtaining the mutual information value of a pair of images (reference)
def mutual_information(img1, img2):
import numpy as np
import cv2
import matplotlib.pyplot as plt
    # img1 and img2 are 3-channel color images; only the first channel of each is compared here
    a = img1[:,:,0:1].reshape(img1.shape[0], img1.shape[1])
    b = img2[:,:,0:1].reshape(img2.shape[0], img2.shape[1])
hgram, x_edges, y_edges = np.histogram2d(
a.ravel(),
b.ravel(),
bins=20
)
pxy = hgram / float(np.sum(hgram))
px = np.sum(pxy, axis=1) # marginal for x over y
py = np.sum(pxy, axis=0) # marginal for y over x
px_py = px[:, None] * py[None, :] # Broadcast to multiply marginals
nzs = pxy > 0 # Only non-zero pxy values contribute to the sum
return np.sum(pxy[nzs] * np.log(pxy[nzs] / px_py[nzs]))
⒁ Code for obtaining a regression curve
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit([[0,0], [1,1], [2,2]], [0, 1, 2])
# LinearRegression()
reg.coef_
# array([0.5, 0.5])
⒂ Create a spot mapping image when given a background image, x-coordinate, y-coordinate, and color c in a spatial transcriptomics (ST).
def spatial_featuremap(img, x, y, c):
    tsimg = np.zeros(img.shape[:2])
    tsimg_row = y  # np.array-type variable
    tsimg_col = x  # np.array-type variable
    for rr, cc, t in zip(tsimg_row, tsimg_col, c):
        # draw.circle was removed in recent scikit-image; draw.disk is the replacement
        r_idx, c_idx = draw.disk((rr, cc), radius=2.5)
        tsimg[r_idx, c_idx] = t
    return tsimg
⒃ A function that determines whether an image is embossed (raised) or intaglio (recessed): if the return value (−1 to 1) is positive, the image is embossed; otherwise it is intaglio.
def is_embossing(image):
l_1 = []
l_2 = []
row_cm = image.shape[0]/2
col_cm = image.shape[1]/2
def distance(x:int, y:int):
import numpy as np
return np.sqrt(x*x + y*y)
def my_sqrt(x:int, y:int, z:int):
import numpy as np
return np.sqrt(x*x + y*y + z*z)
for i in range(image.shape[0]):
for j in range(image.shape[1]):
l_1.append( distance(i - row_cm, j - col_cm ))
if len(image.shape) == 2: # grayscale
l_2.append( image[i:i+1, j:j+1].item() )
elif len(image.shape) == 3: # 3-channel color image
l_2.append(
my_sqrt( image[i:i+1, j:j+1, 0:1].item(),
image[i:i+1, j:j+1, 1:2].item(),
image[i:i+1, j:j+1, 2:3].item()
)
)
print("Distance from Center")
print(np.mean(l_1))
print("Pixel Intensity")
print(np.mean(l_2))
plt.scatter(l_1, l_2, alpha = 0.01)
plt.ylim(0, 255)
plt.xlabel('Distance from Center')
plt.ylabel('Pixel Intensity')
return scipy.stats.pearsonr(l_1, l_2)[0] * (-1)
⒄ Useful code that combines multiple plots into a single figure
fig, axes = plt.subplots(2,2,figsize=(6,20))
sns.barplot(df1[0:20], x='Gene', y ='Enrichment', ax = axes[0][0])
axes[0][0].tick_params(axis='x', rotation=90)
axes[0][0].set_title('DataFrame 1')
sns.barplot(df2[0:20], x='Gene', y ='Enrichment', ax = axes[0][1])
axes[0][1].tick_params(axis='x', rotation=90)
axes[0][1].set_title('DataFrame 2')
sns.barplot(df3[0:20], x='Gene', y ='Enrichment', ax = axes[1][0])
axes[1][0].tick_params(axis='x', rotation=90)
axes[1][0].set_title('DataFrame 3')
sns.barplot(df4[0:20], x='Gene', y ='Enrichment', ax = axes[1][1])
axes[1][1].tick_params(axis='x', rotation=90)
axes[1][1].set_title('DataFrame 4')
plt.tight_layout()
plt.show()
⒅ Code for decoding the base64 string of an image when it is truncated.
import base64
from PIL import Image, ImageFile
import io
# if the data is <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+0AA..."
data = "iVBORw0KGgoAAAANSUhEUgAAA+0AA..." # you need to modify this part
# If the length of the data is 'num' more than a multiple of 4, trim the trailing character(s)
num = len(data) % 4
if num:
    data = data[:-num]  # guard num == 0: data[:-0] would empty the string
# Add padding to ensure the length is a multiple of 4
padding = '=' * ((4 - len(data) % 4) % 4)
data = data + padding
try:
decoded_data = base64.b64decode(data)
# Process the decoded data as needed
except Exception as e:
print("Error in decoding base64:", e)
ImageFile.LOAD_TRUNCATED_IMAGES = True  # allow PIL to open truncated images
try:
image = Image.open(io.BytesIO(decoded_data))
image.show()
except IOError:
print("Cannot open the image.")
⒆ chart-to-text (ref)
⒇ Create a random image of the same size as the given image
import numpy as np
import cv2
# Load the image using OpenCV
tsimg = cv2.imread(imgfile)
# Get the dimensions of the original image
height, width, channels = tsimg.shape
# Create a random image with the same dimensions
# Random values are generated in the range 0-255, which is the range for pixel values in an 8-bit image
random_img = np.random.randint(0, 256, (height, width, channels), dtype=np.uint8)
# Optionally, you can save or display the random image using OpenCV
cv2.imwrite('random_image.png', random_img) # Save the random image
㉑ Otsu's thresholding with a manually adjusted threshold value
def otsu_thresholding(image_path):
import cv2
import matplotlib.pyplot as plt
# Load the image from the specified path
image = cv2.imread(image_path)
# Convert the image to grayscale
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply Gaussian blur to the grayscale image
blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
# Perform Otsu's thresholding to obtain the initial thresholded image
    otsu_value, initial_otsu_thresholded = cv2.threshold(blurred_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Modify the threshold value if needed
    threshold_adjustment = 5
    modified_threshold_value = otsu_value + threshold_adjustment
    _, modified_otsu_thresholded = cv2.threshold(blurred_image, modified_threshold_value, 255, cv2.THRESH_BINARY)
# Display the original grayscale, initial Otsu thresholded, and modified Otsu thresholded images
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.imshow(gray_image, cmap='gray')
plt.title('Grayscale Image')
plt.axis('off')
plt.subplot(1, 3, 2)
plt.imshow(initial_otsu_thresholded, cmap='gray')
plt.title('Initial Otsu Thresholded Image')
plt.axis('off')
plt.subplot(1, 3, 3)
plt.imshow(modified_otsu_thresholded, cmap='gray')
plt.title('Modified Otsu Thresholded Image')
plt.axis('off')
plt.show()
# Return the modified Otsu thresholded image
return modified_otsu_thresholded
㉒ Visualize images on the Internet
import matplotlib.pyplot as plt
import requests
from PIL import Image
from io import BytesIO
# URL of the image
image_url = "https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FmxJpC%2FbtsrErXwO2Q%2FCMhkpoHTn0Ht65uhdwkeSk%2Fimg.png"
# Loading the image
response = requests.get(image_url)
img = Image.open(BytesIO(response.content))
# Visualization of the image
plt.figure(figsize=(8, 8))
plt.imshow(img)
plt.axis('off') # Removal of axis
plt.show()
7. Data Science
⑴ K means clustering
from sklearn.cluster import KMeans
X = [[1, 2, 3],[1, 2, 3.1],[2, 2, 4],[2, 2, 4.1]]
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
cluster_labels = kmeans.labels_
print(cluster_labels)
# [1 1 0 0]
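Once fitted, the same model can assign new points to the learned clusters (a sketch; the exact label values depend on the fit):
print(kmeans.predict([[1, 2, 3.05], [2, 2, 4.05]]))   # e.g. [1 0]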
⑵ Calculating the Gini Coefficient from a Sequence
def gini_coefficient(values):
# If the list of values is empty or all values are the same, the Gini coefficient is 0.
if not values or all(x == values[0] for x in values):
return 0.0
n = len(values)
mean_value = sum(values) / n
sum_of_differences = sum(abs(x - y) for x in values for y in values)
# Calculate the Gini coefficient
gini = sum_of_differences / (2 * n**2 * mean_value)
return gini
# Example data
income = [15, 20, 35, 40, 50]
# Calculating the Gini coefficient
gini_index = gini_coefficient(income)
print(f"The Gini coefficient is {gini_index:.2f}")
⑶ Calculate the Shannon entropy (Shannon diversity index) when given a sequence
import numpy as np
def shannon_index(data):
# Count unique values and their occurrences in the data
values, counts = np.unique(data, return_counts=True)
# Calculate proportions using the counts
proportions = counts / sum(counts)
# Calculate Shannon index, considering only non-zero proportions
return -np.sum(proportions * np.log(proportions))
# Example data, where '1' appears 3 times, '2' appears 2 times, and '3' appears 1 time
data = [1, 1, 1, 2, 2, 3]
# Calculate the Shannon index
index = shannon_index(data)
print(f"Shannon index: {index}")
8. Deep Learning Function
⑴ Deep learning example: Applying a deep learning model to the MNIST (i.e., handwritten digits) dataset.
import tensorflow as tf
from tensorflow.keras import layers, models
# Load the MNIST dataset
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# Normalize the pixel values of the images to be between 0 and 1
train_images, test_images = train_images / 255.0, test_images / 255.0
# Define a simple sequential model
def create_model():
model = models.Sequential([
layers.Flatten(input_shape=(28, 28)), # Flatten the 2D image to a 1D array
layers.Dense(128, activation='relu'), # First hidden layer with 128 neurons and ReLU activation
layers.Dropout(0.2), # Dropout layer to reduce overfitting
layers.Dense(10, activation='softmax') # Output layer with 10 neurons for 10 classes and softmax activation
])
return model
# Create the model
model = create_model()
# Compile the model
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
# Train the model
model.fit(train_images, train_labels, epochs=5)
# Evaluate the model
test_loss, test_acc = model.evaluate(test_images, test_labels)
print(f'Test accuracy: {test_acc}')
⑵ Nearest-neighbor algorithm that classifies the 10,000 test samples (Xte) when 60,000 training samples (Xtr) and their 60,000 labels (Ytr) are given
import tensorflow as tf
(Xtr, Ytr), (Xte, Yte) = tf.keras.datasets.mnist.load_data()
print(Xtr.shape) # (60000, 28, 28)
print(Ytr.shape) # (60000,)
print(Xte.shape) # (10000, 28, 28)
print(Yte.shape) # (10000,)
Xtr_rows = Xtr.reshape(Xtr.shape[0], 28*28)
Xte_rows = Xte.reshape(Xte.shape[0], 28*28)
print(Xtr_rows.shape) # (60000, 784)
print(Xte_rows.shape) # (10000, 784)
def KNN_predict(Xtr_rows, Ytr, Xte_rows, dist_metric='L2'):
import numpy as np
class NearestNeighbor(object):
def __init__(self):
pass
def train(self, X, Y):
self.Xtr = X
self.Ytr = Y
def predict(self, X, dist_metric=dist_metric):
num_test = X.shape[0]
Ypred = np.zeros(num_test, dtype = self.Ytr.dtype)
for i in range(num_test):
if dist_metric=='L1': ## L1 distance
distances = np.sum(np.abs(self.Xtr - X[i, :]), axis = 1)
elif dist_metric=='L2': ## L2 distance
distances = np.sum(np.square(self.Xtr - X[i, :]), axis = 1)
elif dist_metric=='dot': ## dot distance
distances = np.dot(self.Xtr, X[i, :])
min_index = np.argmin(distances)
Ypred[i] = self.Ytr[min_index]
if i%100 == 0:
print(i)
return Ypred
nn = NearestNeighbor()
nn.train(Xtr_rows, Ytr)
Yte_predict = nn.predict(Xte_rows, dist_metric)
return Yte_predict
def KNN_evaluate(Xtr_rows, Ytr, Xte_rows, Yte, dist_metric='L2'):
    import numpy as np
    Yte_predict = KNN_predict(Xtr_rows, Ytr, Xte_rows, dist_metric)
    print('accuracy: ' + str(np.mean(Yte_predict == Yte)))
# L1 : accuracy = 47.29%
# L2 : accuracy = 27.24%
# dot : accuracy = 9.9%
KNN_evaluate(Xtr_rows, Ytr, Xte_rows, Yte)
⑶ Binary classifier function implemented by 1D CNN
# reference : https://wikidocs.net/80783
def binary_predict_by_1D_CNN (Xtr_rows, Ytr, Xte_rows, Yte):
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
vocab_size = 10000
    embedding_dim = 256 # dimension of the embedding vector
    dropout_ratio = 0.3 # dropout ratio
    num_filters = 256 # number of kernels (filters)
    kernel_size = 3 # kernel size
    hidden_units = 128 # number of hidden-layer neurons
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))
model.summary()
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(Xtr_rows, Ytr, epochs=20, validation_data=(Xte_rows, Yte), callbacks=[es, mc])
def binary_evaluate_by_1D_CNN(Xtr_rows, Ytr, Xte_rows, Yte):
from tensorflow.keras.models import load_model
binary_predict_by_1D_CNN(Xtr_rows, Ytr, Xte_rows, Yte)
loaded_model = load_model('best_model.h5')
    print("\n Test accuracy: %.4f" % (loaded_model.evaluate(Xte_rows, Yte)[1]))
##################
# Example 1. Mnist
import tensorflow as tf
(Xtr, Ytr), (Xte, Yte) = tf.keras.datasets.mnist.load_data()
Xtr_rows = Xtr.reshape(Xtr.shape[0], 28*28)
Xte_rows = Xte.reshape(Xte.shape[0], 28*28)
print(Xtr_rows.shape) # (60000, 784)
print(Ytr.shape) # (60000,)
print(Xte_rows.shape) # (10000, 784)
print(Yte.shape) # (10000,)
binary_evaluate_by_1D_CNN(Xtr_rows, Ytr > 5, Xte_rows, Yte > 5)
# 1D CNN accuracy : 7.82%
##################
# Example 2. IMDB
from tensorflow.keras import datasets
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data(num_words=vocab_size)
max_len = 200
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
binary_evaluate_by_1D_CNN(X_train, y_train, X_test, y_test)
# 1D CNN accuracy : 88.92%
⑷ A function that extracts 512-dimensional features from any image via pre-trained CNN
# reference
## https://github.com/mexchy1000/spade/blob/master/spade_spatial_marker_by_deeplearning.py
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage import draw
import pandas as pd
import argparse
from keras import backend as K
from keras.applications import vgg16, resnet50, inception_v3
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
K.set_image_data_format('channels_last')  # call the setter; assigning to it would overwrite the function
os.environ['KERAS_BACKEND'] = 'tensorflow'
def features_512D_from_image_by_CNN (img, model_name = 'VGG16'):
# Image Patch
img_re = cv2.resize(img, dsize = (32, 32))
tspatches = []
tspatches.append(img_re)
tspatches.append(img_re) # intentional duplicate
tspatches = np.asarray(tspatches)
# Pretrained model
if model_name == 'VGG16':
pretrained_model = vgg16.VGG16(weights='imagenet', include_top = False, pooling='avg', input_shape = (32,32,3))
X_in = tspatches.copy()
X_in = vgg16.preprocess_input(X_in)
elif model_name == 'ResNet50':
pretrained_model = resnet50.ResNet50(weights='imagenet', include_top = False, pooling='avg', input_shape = (32,32,3))
X_in = tspatches.copy()
X_in = resnet50.preprocess_input(X_in)
elif model_name == 'InceptionV3':
pretrained_model = inception_v3.InceptionV3(weights='imagenet', include_top = False, pooling='avg', input_shape = (32,32,3))
X_in = tspatches.copy()
X_in = inception_v3.preprocess_input(X_in)
pretrained_model.trainable = False
pretrained_model.summary()
# Get the features
ts_features = pretrained_model.predict(X_in)
return ts_features[0]
⑸ A function that fetches BERT or BioBERT from the Hugging Face to create an attention matrix for the given sentence
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import seaborn as sns
# Load BERT model and tokenizer (OPTION 1)
'''
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_attentions=True)
'''
# Load BioBERT model and tokenizer (OPTION 2)
model_name = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_attentions=True)
# Input sentence
sentence = "Find diseases associated with glucose"
# Tokenization
inputs = tokenizer(sentence, return_tensors='pt')
# Calculate output values and attention weights through the model
outputs = model(**inputs)
attentions = outputs.attentions # This value is the attention weight of each layer
# Visualization of the attention weight of the first head of the first layer
attention = attentions[0][0][0].detach().numpy()
# List of tokens
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
# Visualization of Attention Weights
plt.figure(figsize=(10, 10))
sns.heatmap(attention, xticklabels=tokens, yticklabels=tokens, cmap='viridis')
plt.title('Attention Weights')
plt.show()
⑹ A function that converts arbitrary variable-length natural language sentences into 384 dimensions while considering their meanings. (cf. CeLLaMA)
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
import numpy as np
def sentences_to_embedding(sentences):
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = embedding_function.embed_documents(sentences)
emb_res = np.asarray(db)
return emb_res
sentences = []
sentences.append("What is the meaning of: obsolete")
sentences.append("What is the meaning of: old-fashioned")
sentences.append("What is the meaning of: demagogue")
emb_res = sentences_to_embedding(sentences)
⑺ Collection of Natural Language Processing and LLM Useful Functions
9. Exception Handling
⑴ Basic exception handling syntax
try:
raise KeyError("Error")
except:
print("An exception occurred")
# An exception occurred
⑵ print error message (ref.)
try:
print(1/0)
except Exception as e:
print(e)
# division by zero
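A slightly fuller sketch with a specific exception type plus else and finally clauses:
try:
    value = int("42")
except ValueError as e:
    print("conversion failed:", e)
else:
    print("converted:", value)   # runs only if no exception was raised
finally:
    print("done")                # runs regardless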
10. Bioinformatics
⑴ Code for acquiring highly variable genes
import scanpy as sc
tissue_dir = './outs'
adata1 = sc.read_visium(tissue_dir)
sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.settings.verbosity = 3
adata1.var_names_make_unique()
sc.pp.normalize_total(adata1, inplace=True)
sc.pp.log1p(adata1)
sc.pp.highly_variable_genes(adata1, flavor="seurat", n_top_genes=200)
gene_list=adata1.var.loc[adata1.var.highly_variable==True,:].index
⑵ Code for acquiring highest expressing genes
import scanpy as sc
import numpy as np
import pandas as pd
pbmc = sc.read_h5ad("Tabula_Sapiens.h5ad")
# normalize counts matrix so that each 'cell' (barcode) has counts summing to 1
pbmc.X_norm = sc.pp.normalize_total(pbmc, target_sum=1, inplace=False)['X']
# create a new adata.var column containing the mean of each column of adata.X_norm above
# this is total normalized counts per gene a.k.a. 'mean_total_expression'
pbmc.var['mean_expression'] = np.ravel(pbmc.X_norm.mean(0))
# return pd.DataFrame of n top-ranked genes by mean expression
x = pd.DataFrame(pbmc.var.nlargest(200, 'mean_expression')['mean_expression'])
⑶ GO plot (ver. 1) (not recommended)
# Reference
## https://github.com/mousepixels/sanbomics_scripts/blob/main/GO_in_python.ipynb
## https://snyk.io/advisor/python/goatools/functions/goatools.anno.genetogo_reader.Gene2GoReader
def GO(markers, max_terms = 6):
# type(markers) = numpy.ndarray
# dtype = object
if len(markers) == 0:
return
from genes_ncbi_mus_musculus_proteincoding import GENEID2NT as GeneID2nt_mus
from genes_ncbi_homo_sapiens_proteincoding import GENEID2NT as GeneID2nt_homo
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
obo_fname = download_go_basic_obo()
fin_gene2go = download_ncbi_associations()
obodag = GODag("go-basic.obo")
#run one time to initialize
mapper = {}
if markers[0] == markers[0].upper(): # Homo sapiens
for key in GeneID2nt_homo:
mapper[GeneID2nt_homo[key].Symbol] = GeneID2nt_homo[key].GeneID
else: # Mus musculus
for key in GeneID2nt_mus:
mapper[GeneID2nt_mus[key].Symbol] = GeneID2nt_mus[key].GeneID
inv_map = {v: k for k, v in mapper.items()}
if markers[0] == markers[0].upper():
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])
else:
objanno = Gene2GoReader(fin_gene2go, taxids=[10090])
ns2assoc = objanno.get_ns2assc()
goeaobj = ''
if markers[0] == markers[0].upper():
goeaobj = GOEnrichmentStudyNS(
GeneID2nt_homo.keys(), # List of human protein-coding genes
ns2assoc, # geneid/GO associations
obodag, # Ontologies
propagate_counts = False,
alpha = 0.05, # default significance cut-off
            methods = ['fdr_bh']) # default multiple-test correction method
else:
goeaobj = GOEnrichmentStudyNS(
GeneID2nt_mus.keys(), # List of mouse protein-coding genes
ns2assoc, # geneid/GO associations
obodag, # Ontologies
propagate_counts = False,
alpha = 0.05, # default significance cut-off
            methods = ['fdr_bh']) # default multiple-test correction method
GO_items = []
temp = goeaobj.ns2objgoea['BP'].assoc
for item in temp:
GO_items += temp[item]
temp = goeaobj.ns2objgoea['CC'].assoc
for item in temp:
GO_items += temp[item]
temp = goeaobj.ns2objgoea['MF'].assoc
for item in temp:
GO_items += temp[item]
def go_it(test_genes):
print(f'input genes: {len(test_genes)}')
mapped_genes = []
for gene in test_genes:
try:
mapped_genes.append(mapper[gene])
except:
pass
print(f'mapped genes: {len(mapped_genes)}')
goea_results_all = goeaobj.run_study(mapped_genes)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
GO = pd.DataFrame(list(map(lambda x: [x.GO, x.goterm.name, x.goterm.namespace, x.p_uncorrected, x.p_fdr_bh,\
x.ratio_in_study[0], x.ratio_in_study[1], GO_items.count(x.GO), list(map(lambda y: inv_map[y], x.study_items)),\
], goea_results_sig)), columns = ['GO', 'term', 'class', 'p', 'p_corr', 'n_genes',\
'n_study', 'n_go', 'study_genes'])
GO = GO[GO.n_genes > 1]
return GO
df = go_it(markers)
df['per'] = df.n_genes/df.n_go
if df.shape[0] > 0:
df = df.sort_values(by=['p_corr'], axis=0, ascending=True)
if df.shape[0] > max_terms:
df = df[0:max_terms]
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
import seaborn as sns
import textwrap
fig, ax = plt.subplots(figsize = (0.5, 2.75))
cmap = mpl.cm.bwr_r
norm = mpl.colors.Normalize(vmin = df.p_corr.min(), vmax = df.p_corr.max())
mapper = cm.ScalarMappable(norm = norm, cmap = cm.bwr_r)
cbl = mpl.colorbar.ColorbarBase(ax, cmap = cmap, norm = norm, orientation = 'vertical')
plt.figure(figsize = (2,4))
ax = sns.barplot(data = df, x = 'per', y = 'term', palette = mapper.to_rgba(df.p_corr.values))
ax.set_yticklabels([textwrap.fill(e, 22) for e in df['term']])
plt.show()
⑷ GO plot (ver. 2) (recommended)
def GO(gene_list):
# reference : https://gseapy.readthedocs.io/en/latest/gseapy_example.html
import gseapy
from gseapy import barplot, dotplot
if gene_list[0] == gene_list[0].upper():
enr = gseapy.enrichr(gene_list=gene_list, # or "./tests/data/gene_list.txt",
gene_sets=['GO_Biological_Process_2018','GO_Cellular_Component_2018', 'GO_Molecular_Function_2018'],
organism='human', # don't forget to set organism to the one you desired! e.g. Yeast
outdir=None, # don't write to disk
)
else:
enr = gseapy.enrichr(gene_list=gene_list, # or "./tests/data/gene_list.txt",
gene_sets=['GO_Biological_Process_2018','GO_Cellular_Component_2018', 'GO_Molecular_Function_2018'],
organism='mouse', # don't forget to set organism to the one you desired! e.g. Yeast
outdir=None, # don't write to disk
)
try:
ax = dotplot(enr.results,
column="Adjusted P-value",
                 x='Gene_set', # set x axis, so you could do a multi-sample/library comparison
size=10,
top_term=5,
figsize=(3,5),
title = "GO plot",
xticklabels_rot=45, # rotate xtick labels
                 show_ring=True, # set to False to remove outer ring
marker='o',
)
except:
print("ValueError: Warning: No enrich terms when cutoff = 0.05")
① gseapy appears incompatible with plt.subplots.
⑸ convert from ensembl.gene to gene.symbol using a table
def ensembl_to_gene(ensembl_list):
ar = []
human = pd.read_csv("https://blog.kakaocdn.net/dn/29YTj/btrS5iG9QOH/Di6RQKxHOPDii7EjkdHN30/human_genes_36601.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = None)
mouse = pd.read_csv("https://blog.kakaocdn.net/dn/wkjwJ/btrS1QSgrpD/VS8ELANCQyeZAA3vL8JQP0/mouse_genes_32285.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = None)
for i in range(len(ensembl_list)):
if 'ENSG' in ensembl_list[i]: # human gene
index = human[0].tolist().index(ensembl_list[i])
ar.append(human[1][index])
elif 'ENSMUSG' in ensembl_list[i]: # mouse gene
index = mouse[0].tolist().index(ensembl_list[i])
ar.append(mouse[1][index])
return(ar)
⑹ convert from gene.symbol to ensembl.gene using a table
def gene_to_ensembl(gene_list):
ar = []
human = pd.read_csv("https://blog.kakaocdn.net/dn/29YTj/btrS5iG9QOH/Di6RQKxHOPDii7EjkdHN30/human_genes_36601.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = None)
mouse = pd.read_csv("https://blog.kakaocdn.net/dn/wkjwJ/btrS1QSgrpD/VS8ELANCQyeZAA3vL8JQP0/mouse_genes_32285.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = None)
for i in range(len(gene_list)):
if gene_list[i] == gene_list[i].upper(): # human gene
index = human[1].tolist().index(gene_list[i])
ar.append(human[0][index])
else: # mouse gene
index = mouse[1].tolist().index(gene_list[i])
ar.append(mouse[0][index])
return(ar)
⑺ Human genes, mouse genes interconversion
import pandas as pd
import numpy as np
def find_idx(my_list, e):
for i in range(len(my_list)):
if my_list[i] == e:
return i
return -1
def human_to_mouse(human_gene:list):
hom = pd.read_csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")
mouse_gene = []
for i in range(len(human_gene)):
index = find_idx(hom['Symbol'], human_gene[i])
DB_Class_Key = hom['DB Class Key'][index]
hom_ = hom[hom['DB Class Key'] == DB_Class_Key]
hom__ = hom_[hom_['Common Organism Name'] == 'mouse, laboratory']
mouse_gene.append(np.array(hom__['Symbol'])[0])
return mouse_gene
def mouse_to_human(mouse_gene:list):
    hom = pd.read_csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")
human_gene = []
for i in range(len(mouse_gene)):
index = find_idx(hom['Symbol'], mouse_gene[i])
DB_Class_Key = hom['DB Class Key'][index]
hom_ = hom[hom['DB Class Key'] == DB_Class_Key]
hom__ = hom_[hom_['Common Organism Name'] == 'human']
        human_gene.append(np.array(hom__['Symbol'])[0])
return human_gene
⑻ When there is a directory tissue_dir containing matrix.mtx.gz, barcodes.tsv, features.tsv, and the spatial folder, the code to read Visium data in Python.
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.image import imread
import numpy as np
import cv2
import math
from pathlib import Path, PurePath
from typing import Union, Dict, Optional, Tuple, BinaryIO
import json
from scanpy import logging as logg  # provides logg.warning used below
def to_spatial(adata, path, library_id = 'TNBC1'):
adata.uns["spatial"][library_id] = dict()
path = Path(path)
files = dict(
tissue_positions_file=path / 'spatial/tissue_positions_list.csv',
scalefactors_json_file=path / 'spatial/scalefactors_json.json',
hires_image=path / 'spatial/tissue_hires_image.png',
lowres_image=path / 'spatial/tissue_lowres_image.png',
)
# check if files exists, continue if images are missing
for f in files.values():
if not f.exists():
if any(x in str(f) for x in ["hires_image", "lowres_image"]):
logg.warning(
f"You seem to be missing an image file.\n"
f"Could not find '{f}'."
)
else:
raise OSError(f"Could not find '{f}'")
adata.uns["spatial"][library_id]['images'] = dict()
for res in ['hires', 'lowres']:
try:
adata.uns["spatial"][library_id]['images'][res] = imread(
str(files[f'{res}_image'])
)
except Exception:
raise OSError(f"Could not find '{res}_image'")
# read json scalefactors
adata.uns["spatial"][library_id]['scalefactors'] = json.loads(
files['scalefactors_json_file'].read_bytes()
)
#adata.uns["spatial"][library_id]["metadata"] = {
# k: (str(attrs[k], "utf-8") if isinstance(attrs[k], bytes) else attrs[k])
# for k in ("chemistry_description", "software_version")
# if k in attrs
#}
# read coordinates
positions = pd.read_csv(files['tissue_positions_file'], header=None)
positions.columns = [
'barcode',
'in_tissue',
'array_row',
'array_col',
'pxl_col_in_fullres',
'pxl_row_in_fullres',
]
positions.index = positions['barcode']
adata.obs = adata.obs.join(positions, how="left")
adata.obsm['spatial'] = adata.obs[
['pxl_row_in_fullres', 'pxl_col_in_fullres']
].to_numpy()
adata.obs.drop(
columns=['barcode', 'pxl_row_in_fullres', 'pxl_col_in_fullres'],
inplace=True,
)
return adata
def load_adata(tissue_dir):
adata1 = sc.read_mtx(tissue_dir + 'matrix.mtx')
adata1 = adata1.transpose()
cellname = pd.read_csv(tissue_dir + 'barcodes.tsv', sep="\t", header=None, index_col=1, names=['idx', 'barcode'])
cellname.index = cellname['idx']
featurename = pd.read_csv(tissue_dir + 'features.tsv', sep='\t', header=None, index_col=1, names=['gene_ids', 'feature_types'])
adata1.var = featurename
adata1.obs = cellname
adata1.uns["spatial"] = dict()
adata1 = to_spatial(adata1, tissue_dir, library_id='Library_1')
adata1 = adata1[adata1.obs.in_tissue == 1,:]
adata1.var_names_make_unique()
adata1.obs['unnormalized_counts'] = adata1.X.sum(axis=1).A1
sc.pp.normalize_total(adata1, inplace=True)
adata1.obs['normalized_counts'] = adata1.X.sum(axis=1).A1
sc.pp.log1p(adata1)
sc.pp.highly_variable_genes(adata1, flavor="seurat", n_top_genes=2000)
return adata1
⑼ A collection of Python functions related to organic chemistry
⑽ Acquire the human and mouse gene sequences (but only human genes are required)
from Bio import Entrez
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import pandas as pd
def human_mouse_gene_sequence(input_gene_name):
# Entrez email setup (required by NCBI)
Entrez.email = "nate9389@gmail.com"
# Function to fetch the sequence using gene ID
def fetch_sequence(gene_id):
handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text")
record = SeqIO.read(handle, "fasta")
handle.close()
return record.seq
def get_gene_ids(input_gene, file_path="https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv"):
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)
# Normalize the input gene name to uppercase to match the 'Symbol' column
input_gene = input_gene.upper()
# Find the row with the human gene symbol
human_row = df[(df['Common Organism Name'] == 'human') & (df['Symbol'].str.upper() == input_gene)]
# If the human gene is not found, return an error message
if human_row.empty:
return "Error: Human gene not found.", None
# Find the corresponding mouse gene using the DB Class Key
db_class_key = human_row['DB Class Key'].values[0]
mouse_row = df[(df['Common Organism Name'] == 'mouse, laboratory') & (df['DB Class Key'] == db_class_key)]
# If the mouse gene is not found, return an error message
if mouse_row.empty:
return None, "Error: Corresponding mouse gene not found."
# Retrieve the gene IDs
human_gene_id = human_row['Nucleotide RefSeq IDs'].values[0].split(',')[0] # Taking the first RefSeq ID if multiple are present
mouse_gene_id = mouse_row['Nucleotide RefSeq IDs'].values[0].split(',')[0] # Taking the first RefSeq ID if multiple are present
return human_gene_id, mouse_gene_id
# Get the human and mouse gene IDs
human_gene_id, mouse_gene_id = get_gene_ids(input_gene_name)
# Fetch the sequences
human_sequence = fetch_sequence(human_gene_id)
mouse_sequence = fetch_sequence(mouse_gene_id)
return human_sequence, mouse_sequence
### Example
human, mouse = human_mouse_gene_sequence("COL1A1")
⑾ Python code to put an image into Visium anndata obs.
import scanpy as sc
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
tissue_dir1 = './outs/'
adata1 = sc.read_visium(tissue_dir1)
img1 = cv2.imread(tissue_dir1 + 'spatial/tissue_hires_image.png')
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
hires = adata1.uns['spatial']['Parent_Visium_Human_OvarianCancer']['images']['hires']
tissue_hires_scalef = adata1.uns['spatial']['Parent_Visium_Human_OvarianCancer']['scalefactors']['tissue_hires_scalef']
coords = adata1.obsm['spatial'] * tissue_hires_scalef
adata1.obs['img'] = adata1.obs['in_tissue'] # just making a variable
for i in range(adata1.shape[0]):
x_coord = int(np.round(coords[i][0]))
y_coord = int(np.round(coords[i][1]))
# Ensure the coordinates are within the bounds of the image
x_coord = max(0, min(x_coord, hires.shape[1] - 1))
y_coord = max(0, min(y_coord, hires.shape[0] - 1))
# Extract the intensity of the pixel (for simplicity, using the red channel)
intensity = hires[y_coord, x_coord, 0] # Red channel
adata1.obs['img'][i] = intensity
sc.pl.spatial(adata1, color = 'img', img_key=None, spot_size=300)
⑿ Code for reading Visium ST data as AnnData and then performing data QC (Quality Control) operations
def QC(tissue_dir):
metafile = None
adata1 = sc.read_visium(tissue_dir)
### alternatively ###
# adata1 = sc.read_10x_mtx(tissue_dir)
adata1.var_names_make_unique()
sc.pp.normalize_total(adata1, inplace=True)
sc.pp.log1p(adata1)
# sc.pp.highly_variable_genes(adata1, flavor="seurat", n_top_genes=17189)
adata1.obs['n_counts'] = adata1.X.sum(axis=1).A1
sc.pl.spatial(adata1, img_key="hires", color='n_counts', alpha=0.7, cmap='jet')
return adata1
⒀ Enlarge the sc.pl.spatial figure size.
fig, ax = plt.subplots(figsize=(10,10))
sc.pl.spatial(adata, color = 'total_counts', ax=ax, show=False)
plt.show()
⒁ Remove color bar from sc.pl.spatial
import matplotlib.pyplot as plt
import scanpy as sc
sc.pl.spatial(adata, img_key="hires", color='score', alpha=1.0, cmap='viridis', show=False)
fig = plt.gcf()
ax = fig.axes
# Find and remove the colorbar. The colorbar is usually the last axis in the figure.
if len(ax) > 1: # If there's more than one axis (i.e., the colorbar is present)
fig.delaxes(ax[-1]) # Remove the last axis (the colorbar)
plt.show()
⒂ Transpose adata: swap gene name and barcode name
import scanpy as sc
import pandas as pd
# Assuming 'data' is your AnnData object
# Load your AnnData object if it's not already in memory
# data = sc.read_h5ad('/path/to/your/data.h5ad')
# Transpose the data matrix and swap the 'obs' and 'var' DataFrames
adata_transposed = sc.AnnData(X=data.X.T, obs=data.var, var=data.obs)
# If 'obs_names' and 'var_names' need to be retained, you might want to transfer them as well
adata_transposed.obs_names = data.var_names
adata_transposed.var_names = data.obs_names
# Now 'adata_transposed' is the transposed version of 'data'
⒃ Creating an h5ad file when you have matrix.mtx, barcodes.tsv, and features.tsv (or genes.tsv): Assume that all three files have the same prefix.
import os
import glob
from scipy.io import mmread
import pandas as pd
import scanpy as sc
# Set the target directory here
target_directory = '/path/to/your/directory'
# Iterate over all .mtx files in the target directory
for mtx_file in glob.glob(os.path.join(target_directory, '*matrix.mtx')):
prefix = mtx_file[:-10] # Remove 'matrix.mtx' to get the prefix
    # Construct file names for barcodes and genes (prefer features.tsv, fall back to genes.tsv)
    barcodes_file = prefix + 'barcodes.tsv'
    genes_file = prefix + 'features.tsv' if os.path.exists(prefix + 'features.tsv') else prefix + 'genes.tsv'
# Check if all files exist
if os.path.exists(mtx_file) and os.path.exists(barcodes_file) and os.path.exists(genes_file):
# Load the data
matrix = mmread(mtx_file).T.tocsr() # Transpose to have genes as columns
genes = pd.read_csv(genes_file, header=None, sep='\t')
barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
# Create an AnnData object
adata = sc.AnnData(X=matrix, obs={'barcodes': barcodes[0].values}, var={'gene_ids': genes[0].values, 'gene_symbols': genes[1].values})
# Output file name
h5ad_file = prefix + '.h5ad'
# Save the AnnData object
adata.write(h5ad_file)
# Delete the original files
os.remove(mtx_file)
os.remove(barcodes_file)
os.remove(genes_file)
print("Conversion complete. Original files have been deleted.")
⒄ Code to concatenate a list of adata objects into a single adata
# Note that adatas = [adata1, adata2, ...]
adata_concat = sc.concat(adatas, keys="sample1 sample2 sample3 sample4 sample5 sample6".split(" "), index_unique='_', label='sample')
⒅ Code for providing the number of reads per barcode from an H5 file.
import h5py
import numpy as np
def get_mapped_reads(h5_file):
    with h5py.File(h5_file, 'r') as f:
# Access the matrix group in the HDF5 file
matrix_group = f['matrix']
# Read the barcode, gene, and data datasets
barcodes = np.array(matrix_group['barcodes']).astype(str)
genes = np.array(matrix_group['features/name']).astype(str)
data = np.array(matrix_group['data'])
        # Count the nonzero (expressed) entries per barcode via differences of the CSC index pointer;
        # to sum the actual counts instead, use np.add.reduceat(data, matrix_group['indptr'][:-1])
        barcode_read_counts = np.array(matrix_group['indptr'][1:] - matrix_group['indptr'][:-1])
return barcodes, genes, barcode_read_counts
⒆ Code for providing the features (genes) from an h5 file.
import h5py
# Path to your HDF5 file
file_path = f'./{datas[0]}/filtered_feature_bc_matrix.h5' # Replace with your actual file path
# Open the HDF5 file and inspect its contents
with h5py.File(file_path, 'r') as f:
# List all groups in the file
print("Keys in HDF5 file:")
print(list(f.keys()))
# Access the 'matrix/features/name' to check what features are present
features = f['matrix/features/name'][:]
# Convert byte strings to normal strings (if necessary)
features = [feature.decode('utf-8') for feature in features]
print("\nFeatures in the HDF5 file:")
for feature in features:
print(feature)
Input: 2022.05.07 00:43