import pegasus as pg


data = pg.read_input("MantonBM_nonmix_subset.zarr.zip")
data

2022-04-10 17:53:45,594 - pegasusio.readwrite - INFO - zarr file 'MantonBM_nonmix_subset.zarr.zip' is loaded.
2022-04-10 17:53:45,595 - pegasusio.readwrite - INFO - Function 'read_input' finished in 0.29s.

MultimodalData object with 1 UnimodalData: 'GRCh38-rna'
    It currently binds to UnimodalData object GRCh38-rna

UnimodalData object with n_obs x n_vars = 48219 x 36601
    Genome: GRCh38; Modality: rna
    It contains 1 matrix: 'X'
    It currently binds to matrix 'X' as X

    obs: 'n_genes', 'Channel', 'gender'
    var: 'featureid'
    uns: 'genome', 'modality'


pg.qc_metrics(data, min_genes=500, max_genes=6000, mito_prefix='MT-', percent_mito=10)
pg.filter_data(data)
pg.identify_robust_genes(data)
pg.log_norm(data)

2022-04-10 17:53:45,917 - pegasusio.qc_utils - INFO - After filtration, 35465 out of 48219 cell barcodes are kept in UnimodalData object GRCh38-rna.
2022-04-10 17:53:45,917 - pegasus.tools.preprocessing - INFO - Function 'filter_data' finished in 0.16s.
2022-04-10 17:53:46,303 - pegasus.tools.preprocessing - INFO - After filtration, 25653/36601 genes are kept. Among 25653 genes, 17516 genes are robust.
2022-04-10 17:53:46,304 - pegasus.tools.preprocessing - INFO - Function 'identify_robust_genes' finished in 0.39s.
2022-04-10 17:53:46,821 - pegasus.tools.preprocessing - INFO - Function 'log_norm' finished in 0.52s.


data.obs['Channel'].value_counts()

MantonBM2_HiSeq_1    4935
MantonBM6_HiSeq_1    4665
MantonBM8_HiSeq_1    4511
MantonBM7_HiSeq_1    4452
MantonBM1_HiSeq_1    4415
MantonBM3_HiSeq_1    4225
MantonBM4_HiSeq_1    4172
MantonBM5_HiSeq_1    4090
Name: Channel, dtype: int64


data_baseline = data.copy()
pg.highly_variable_features(data_baseline)

2022-04-10 17:53:47,037 - pegasus.tools.hvf_selection - INFO - Function 'estimate_feature_statistics' finished in 0.11s.
2022-04-10 17:53:47,063 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2022-04-10 17:53:47,063 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.14s.


pg.pca(data_baseline)
pg.neighbors(data_baseline)
pg.louvain(data_baseline)
pg.umap(data_baseline)
pg.scatter(data_baseline, attrs=['louvain_labels', 'Channel'], basis='umap')

2022-04-10 17:53:51,618 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 4.55s.
2022-04-10 17:53:55,909 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.29s.
2022-04-10 17:53:56,823 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.91s.
2022-04-10 17:53:57,954 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.13s.
2022-04-10 17:54:10,621 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 19 clusters.
2022-04-10 17:54:10,621 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 13.80s.
2022-04-10 17:54:10,622 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 17:54:10,622 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
UMAP(min_dist=0.5, precomputed_knn=(array([[    0, 30526, 27514, ..., 26911, 222, 25411],
       [    1, 29723, 2651, ..., 4946, 1177, 30829],
       [    2, 35262, 20170, ..., 2478, 34953, 19206],
       ...,
       [35462, 6096, 30824, ..., 15433, 19601, 30227],
       [35463, 34252, 34709, ..., 5765, 32603, 31084],
       [35464, 5379, 35401, ..., 10528, 24551, 27891]]), array([[ 0.       , 4.9841695, 5.4866176, ..., 5.8861804, 5.899298 ,
         5.9012275],
       [ 0.       , 8.27516  , 8.679712 , ..., 9.440161 , 9.484825 ,
         9.490481 ],
       [ 0.       , 4.7937765, 5.125005 , ..., 5.489494 , 5.4935   ,
         5.495704 ],
       ...,
       [ 0.       , 7.368082 , 7.414498 , ..., 8.2814455, 8.314083 ,
         8.34855  ],
       [ 0.       , 8.759402 , 8.7638645, ..., 10.250344 , 10.255535 ,
        10.261412 ],
       [ 0.       , 7.1296797, 7.2653217, ..., 8.02741  , 8.158878 ,
         8.207317 ]], dtype=float32), <pegasus.tools.visualization.DummyNNDescent object at 0x7fe5d373cf70>), random_state=0, verbose=True)
Sun Apr 10 17:54:10 2022 Construct fuzzy simplicial set
Sun Apr 10 17:54:12 2022 Construct embedding

Epochs completed:   0%|            0/200 [00:00]

Sun Apr 10 17:54:28 2022 Finished embedding
2022-04-10 17:54:28,276 - pegasus.tools.visualization - INFO - Function 'umap' finished in 17.65s.


pg.highly_variable_features(data, batch='Channel')

2022-04-10 17:54:29,181 - pegasus.tools.hvf_selection - INFO - Function 'estimate_feature_statistics' finished in 0.13s.
2022-04-10 17:54:29,218 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2022-04-10 17:54:29,218 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.17s.


data_harmony = data.copy()
pg.pca(data_harmony)

2022-04-10 17:54:33,877 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 4.55s.


harmony_key = pg.run_harmony(data_harmony, use_gpu=True)

2022-04-10 17:54:34,174 - pegasus.tools.batch_correction - INFO - Start integration using Harmony.
Use GPU mode.
	Initialization is completed.
	Completed 1 / 10 iteration(s).
	Completed 2 / 10 iteration(s).
	Completed 3 / 10 iteration(s).
	Completed 4 / 10 iteration(s).
	Completed 5 / 10 iteration(s).
	Completed 6 / 10 iteration(s).
	Completed 7 / 10 iteration(s).
	Completed 8 / 10 iteration(s).
Reach convergence after 8 iteration(s).
2022-04-10 17:54:45,486 - pegasus.tools.batch_correction - INFO - Function 'run_harmony' finished in 11.61s.


pg.neighbors(data_harmony, rep=harmony_key)
pg.louvain(data_harmony, rep=harmony_key)
pg.umap(data_harmony, rep=harmony_key)

2022-04-10 17:54:50,238 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.75s.
2022-04-10 17:54:51,306 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 1.07s.
2022-04-10 17:54:52,483 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.18s.
2022-04-10 17:55:05,058 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 17 clusters.
2022-04-10 17:55:05,083 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 13.78s.
2022-04-10 17:55:05,084 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 17:55:05,084 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
UMAP(min_dist=0.5, precomputed_knn=(array([[    0, 32454, 33203, ..., 34421, 4585, 33415],
       [    1, 7658, 32973, ..., 33173, 21503, 20769],
       [    2, 13228, 20787, ..., 27422, 28285, 34507],
       ...,
       [35462, 31460, 28289, ..., 11139, 21229, 20250],
       [35463, 34709, 8206, ..., 26634, 1453, 17529],
       [35464, 35401, 8702, ..., 13853, 32612, 24418]]), array([[ 0.       , 5.1344194, 5.189243 , ..., 5.800482 , 5.834844 ,
         5.835115 ],
       [ 0.       , 7.7050667, 8.047325 , ..., 8.791688 , 8.835943 ,
         8.856789 ],
       [ 0.       , 4.039253 , 4.5694942, ..., 5.176337 , 5.177398 ,
         5.2111526],
       ...,
       [ 0.       , 6.6650157, 6.7569337, ..., 7.6235266, 7.6727905,
         7.6997538],
       [ 0.       , 8.145101 , 9.145093 , ..., 10.019776 , 10.028579 ,
        10.075567 ],
       [ 0.       , 5.8484764, 5.9088817, ..., 7.372008 , 7.3747354,
         7.4893804]], dtype=float32), <pegasus.tools.visualization.DummyNNDescent object at 0x7fe5d340f220>), random_state=0, verbose=True)
Sun Apr 10 17:55:05 2022 Construct fuzzy simplicial set
Sun Apr 10 17:55:05 2022 Construct embedding

Epochs completed:   0%|            0/200 [00:00]

Sun Apr 10 17:55:21 2022 Finished embedding
2022-04-10 17:55:21,426 - pegasus.tools.visualization - INFO - Function 'umap' finished in 16.34s.


pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'], basis='umap')


data_inmf = data.copy()
inmf_key = pg.integrative_nmf(data_inmf, batch='Channel', use_gpu=True)

Use GPU mode.
Pass 1, loss=117.82050637479075.
Pass 2, loss=116.58384274382114.
Pass 3, loss=116.4230007085977.
Pass 4, loss=116.39696546970904.
Pass 5, loss=116.41854145467376.
    Converged after 5 pass(es).
Final loss=116.36377528355716.
2022-04-10 17:55:39,047 - pegasus.tools.nmf - INFO - Function 'integrative_nmf' finished in 16.60s.


pg.neighbors(data_inmf, rep=inmf_key)
pg.louvain(data_inmf, rep=inmf_key)
pg.umap(data_inmf, rep=inmf_key)

2022-04-10 17:55:41,820 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.77s.
2022-04-10 17:55:42,813 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.99s.
2022-04-10 17:55:43,839 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.02s.
2022-04-10 17:56:00,218 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 21 clusters.
2022-04-10 17:56:00,239 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 17.43s.
2022-04-10 17:56:00,240 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 17:56:00,240 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
UMAP(min_dist=0.5, precomputed_knn=(array([[    0, 12123, 27568, ..., 34421, 11193, 18934],
       [    1, 1594, 17631, ..., 20095, 26877, 14073],
       [    2, 27342, 24506, ..., 12901, 24727, 10017],
       ...,
       [35462, 23334, 9671, ..., 7246, 32709, 33849],
       [35463, 30298, 11833, ..., 4739, 13777, 29583],
       [35464, 8702, 19094, ..., 25361, 2053, 19537]]), array([[0.        , 0.00494586, 0.00532262, ..., 0.00648283, 0.00656692,
        0.00662607],
       [0.        , 0.01005144, 0.01099363, ..., 0.01310497, 0.01326502,
        0.0133247 ],
       [0.        , 0.00567458, 0.00618163, ..., 0.00720941, 0.00721262,
        0.00722609],
       ...,
       [0.        , 0.00570703, 0.00864239, ..., 0.01141857, 0.01165916,
        0.01195962],
       [0.        , 0.00877831, 0.00937465, ..., 0.01553986, 0.01563119,
        0.01570668],
       [0.        , 0.0112036 , 0.01227087, ..., 0.02012998, 0.02018525,
        0.02046473]], dtype=float32), <pegasus.tools.visualization.DummyNNDescent object at 0x7fe46fe1d460>), random_state=0, verbose=True)
Sun Apr 10 17:56:00 2022 Construct fuzzy simplicial set
Sun Apr 10 17:56:00 2022 Construct embedding

Epochs completed:   0%|            0/200 [00:00]

Sun Apr 10 17:56:16 2022 Finished embedding
2022-04-10 17:56:16,863 - pegasus.tools.visualization - INFO - Function 'umap' finished in 16.62s.


pg.scatter(data_inmf, attrs=['louvain_labels', 'Channel'], basis='umap')


data_scvi = data.copy()
scvi_key = pg.run_scvi(data_scvi, batch='Channel', use_gpu=True)

Global seed set to 0

2022-04-10 17:56:18,633 - pegasus.tools.scvitools - INFO - Start embedding using scVI.

Global seed set to 0
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Epoch 1/226:   0%|                                                                                                                                                                         | 0/226 [00:00<?, ?it/s]

/opt/miniconda3/envs/py39/lib/python3.9/site-packages/scvi/distributions/_negative_binomial.py:61: UserWarning: Specified kernel cache directory could not be created! This disables kernel caching. Specified directory is /home/yiming/.cache/torch/kernels. This warning will appear only once per process. (Triggered internally at  ../aten/src/ATen/native/cuda/jit_utils.cpp:860.)
  + torch.lgamma(x + theta)

Epoch 226/226: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 226/226 [07:53<00:00,  2.10s/it, loss=419, v_num=1]
2022-04-10 18:04:12,810 - pegasus.tools.scvitools - INFO - Function 'run_scvi' finished in 474.88s.


pg.neighbors(data_scvi, rep=scvi_key)
pg.louvain(data_scvi, rep=scvi_key)
pg.umap(data_scvi, rep=scvi_key)

2022-04-10 18:04:15,852 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 3.04s.
2022-04-10 18:04:16,851 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 1.00s.
2022-04-10 18:04:17,851 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.00s.
2022-04-10 18:04:29,983 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 18 clusters.
2022-04-10 18:04:30,006 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 13.15s.
2022-04-10 18:04:30,007 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:04:30,008 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
UMAP(min_dist=0.5, precomputed_knn=(array([[    0, 23499, 1608, ..., 3344, 25541, 4372],
       [    1, 231, 3185, ..., 12137, 18091, 18563],
       [    2, 18679, 18185, ..., 11549, 9443, 12769],
       ...,
       [35462, 35177, 28976, ..., 28867, 6565, 14087],
       [35463, 32162, 33629, ..., 31600, 12355, 34988],
       [35464, 2586, 31236, ..., 14480, 10528, 13114]]), array([[0.        , 1.1778232 , 1.2298497 , ..., 1.5208108 , 1.5408028 ,
        1.5542879 ],
       [0.        , 0.69611096, 0.6967799 , ..., 0.90967846, 0.9107625 ,
        0.9199178 ],
       [0.        , 0.4717493 , 0.74500614, ..., 0.927016  , 0.93040264,
        0.9329058 ],
       ...,
       [0.        , 0.667613  , 0.6786024 , ..., 0.7649085 , 0.77723014,
        0.7794315 ],
       [0.        , 0.72530776, 0.86726284, ..., 1.1541992 , 1.1636413 ,
        1.1666512 ],
       [0.        , 0.80819905, 0.8488365 , ..., 1.1961058 , 1.2048146 ,
        1.2279631 ]], dtype=float32), <pegasus.tools.visualization.DummyNNDescent object at 0x7fe534262310>), random_state=0, verbose=True)
Sun Apr 10 18:04:30 2022 Construct fuzzy simplicial set
Sun Apr 10 18:04:30 2022 Construct embedding

Epochs completed:   0%|            0/200 [00:00]

Sun Apr 10 18:04:45 2022 Finished embedding
2022-04-10 18:04:45,061 - pegasus.tools.visualization - INFO - Function 'umap' finished in 15.05s.


pg.scatter(data_scvi, attrs=['louvain_labels', 'Channel'], basis='umap')


data_scan = data.copy()
scan_key = pg.run_scanorama(data_scan)

2022-04-10 18:04:46,118 - pegasus.tools.batch_correction - INFO - Start integration using Scanorama.
Found 2000 genes among all datasets
[[0.         0.69648924 0.29349112 0.49263873 0.51393643 0.46345123
  0.57553794 0.40057637]
 [0.         0.         0.13964497 0.6662614  0.32518337 0.6006079
  0.6674772  0.22611394]
 [0.         0.         0.         0.17761266 0.61775148 0.09046088
  0.17017751 0.55100592]
 [0.         0.         0.         0.         0.45427873 0.63965702
  0.55417066 0.43094658]
 [0.         0.         0.         0.         0.         0.40880196
  0.49144254 0.67479495]
 [0.         0.         0.         0.         0.         0.
  0.6829582  0.32604502]
 [0.         0.         0.         0.         0.         0.
  0.         0.60053203]
 [0.         0.         0.         0.         0.         0.
  0.         0.        ]]
Processing datasets (0, 1)
Processing datasets (5, 6)
Processing datasets (4, 7)
Processing datasets (1, 6)
Processing datasets (1, 3)
Processing datasets (3, 5)
Processing datasets (2, 4)
Processing datasets (1, 5)
Processing datasets (6, 7)
Processing datasets (0, 6)
Processing datasets (3, 6)
Processing datasets (2, 7)
Processing datasets (0, 4)
Processing datasets (0, 3)
Processing datasets (4, 6)
Processing datasets (0, 5)
Processing datasets (3, 4)
Processing datasets (3, 7)
Processing datasets (4, 5)
Processing datasets (0, 7)
Processing datasets (5, 7)
Processing datasets (1, 4)
Processing datasets (0, 2)
Processing datasets (1, 7)
Processing datasets (2, 3)
Processing datasets (2, 6)
Processing datasets (1, 2)
2022-04-10 18:06:46,757 - pegasus.tools.batch_correction - INFO - Function 'run_scanorama' finished in 120.65s.


pg.neighbors(data_scan, rep=scan_key)
pg.louvain(data_scan, rep=scan_key)
pg.umap(data_scan, rep=scan_key)

2022-04-10 18:06:52,029 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 5.27s.
2022-04-10 18:06:52,958 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.93s.
2022-04-10 18:06:54,059 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.10s.
2022-04-10 18:07:05,554 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 18 clusters.
2022-04-10 18:07:05,584 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 12.62s.
2022-04-10 18:07:05,585 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:07:05,585 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
UMAP(min_dist=0.5, precomputed_knn=(array([[    0, 35281, 17697, ..., 27898, 20930, 28587],
       [    1, 21594, 31342, ..., 2926, 24935, 31029],
       [    2, 22804, 18671, ..., 30580, 11422, 1164],
       ...,
       [35462, 30699, 20801, ..., 30824, 2806, 6037],
       [35463, 5765, 30996, ..., 32077, 26634, 13143],
       [35464, 16222, 9302, ..., 14784, 17445, 10528]]), array([[0.        , 0.17744967, 0.17837791, ..., 0.18745066, 0.19058748,
        0.19085892],
       [0.        , 0.14216453, 0.14251806, ..., 0.15079266, 0.1508384 ,
        0.15128516],
       [0.        , 0.17242445, 0.17907873, ..., 0.19395164, 0.19539708,
        0.19570096],
       ...,
       [0.        , 0.14379458, 0.14453293, ..., 0.15746635, 0.15759753,
        0.15768066],
       [0.        , 0.12581435, 0.12616852, ..., 0.14605844, 0.1462014 ,
        0.14636491],
       [0.        , 0.12916026, 0.14846997, ..., 0.17055044, 0.17228062,
        0.1727689 ]], dtype=float32), <pegasus.tools.visualization.DummyNNDescent object at 0x7fe47016dc40>), random_state=0, verbose=True)
Sun Apr 10 18:07:05 2022 Construct fuzzy simplicial set
Sun Apr 10 18:07:05 2022 Construct embedding

Epochs completed:   0%|            0/200 [00:00]

Sun Apr 10 18:07:22 2022 Finished embedding
2022-04-10 18:07:22,549 - pegasus.tools.visualization - INFO - Function 'umap' finished in 16.96s.


pg.scatter(data_scan, attrs=['louvain_labels', 'Channel'], basis='umap')


_, _, kBET_baseline = pg.calc_kBET(data_baseline, attr='Channel', rep='umap')
_, _, kBET_harmony = pg.calc_kBET(data_harmony, attr='Channel', rep='umap')
_, _, kBET_inmf = pg.calc_kBET(data_inmf, attr='Channel', rep='umap')
_, _, kBET_scvi = pg.calc_kBET(data_scvi, attr='Channel', rep='umap')
_, _, kBET_scan = pg.calc_kBET(data_scan, attr='Channel', rep='umap')

2022-04-10 18:07:25,208 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 1.96s.
2022-04-10 18:07:35,337 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.03s.
2022-04-10 18:07:39,271 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.06s.
2022-04-10 18:07:43,202 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.00s.
2022-04-10 18:07:47,059 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 1.96s.


import pandas as pd
import numpy as np

df_celltypes = pd.read_csv("cell_types.csv", index_col='barcodekey')

assert np.sum(df_celltypes.index!=data_baseline.obs_names) == 0
data_baseline.obs['cell_types'] = df_celltypes['cell_types']

assert np.sum(df_celltypes.index!=data_harmony.obs_names) == 0
data_harmony.obs['cell_types'] = df_celltypes['cell_types']

assert np.sum(df_celltypes.index!=data_inmf.obs_names) == 0
data_inmf.obs['cell_types'] = df_celltypes['cell_types']

assert np.sum(df_celltypes.index!=data_scvi.obs_names) == 0
data_scvi.obs['cell_types'] = df_celltypes['cell_types']

assert np.sum(df_celltypes.index!=data_scan.obs_names) == 0
data_scan.obs['cell_types'] = df_celltypes['cell_types']


_, kSIM_baseline = pg.calc_kSIM(data_baseline, attr='cell_types', rep='umap')
_, kSIM_harmony = pg.calc_kSIM(data_harmony, attr='cell_types', rep='umap')
_, kSIM_inmf = pg.calc_kSIM(data_inmf, attr='cell_types', rep='umap')
_, kSIM_scvi = pg.calc_kSIM(data_scvi, attr='cell_types', rep='umap')
_, kSIM_scan = pg.calc_kSIM(data_scan, attr='cell_types', rep='umap')

2022-04-10 18:07:49,126 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:07:49,127 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2022-04-10 18:07:49,135 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:07:49,135 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2022-04-10 18:07:49,141 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:07:49,142 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2022-04-10 18:07:49,149 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:07:49,149 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2022-04-10 18:07:49,155 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2022-04-10 18:07:49,156 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.


import seaborn as sns
import matplotlib.pyplot as plt

df_plot = pd.DataFrame({'method': ['Baseline', 'Harmony', 'iNMF', 'scVI', 'Scanorama'],
                        'kBET': [kBET_baseline, kBET_harmony, kBET_inmf, kBET_scvi, kBET_scan],
                        'kSIM': [kSIM_baseline, kSIM_harmony, kSIM_inmf, kSIM_scvi, kSIM_scan]})

plt.figure(dpi=100)
ax = sns.scatterplot(x = 'kSIM', y = 'kBET', hue = 'method', data = df_plot, legend = False)
for line in range(0, df_plot.shape[0]):
    x_pos = df_plot.kSIM[line] + 0.003
    if df_plot.method[line] == 'Baseline':
        x_pos = df_plot.kSIM[line] - 0.003
    y_pos = df_plot.kBET[line]
    if df_plot.method[line] == 'iNMF':
        y_pos -= 0.01
    alignment = 'right' if df_plot.method[line] == 'Baseline' else 'left'
    ax.text(x_pos, y_pos, df_plot.method[line], ha = alignment, size = 'medium', color = 'black')
plt.xlabel('kSIM acceptance rate')
plt.ylabel('kBET acceptance rate')

Text(0, 0.5, 'kBET acceptance rate')

Batch Correction Tutorial¶

Test Environment¶

Dataset¶

Sections¶

Preprocessing¶

Clustering without Correcting Batch Effects¶

Batch Correction Methods¶

Harmony Algorithm¶

Integrative-NMF¶

scVI¶

Scanorama Algorithm¶

Comparing Batch Correction Methods¶

kBET Acceptance Rate¶

kSIM Acceptance Rate¶

Performance Plot¶