6. Properties per sequence#
Until now we obtained data for each molecule separately. Here we can combine the data per sequence.
[1]:
from pathlib2 import Path
import papylio as pp
import matplotlib.pyplot as plt
import tqdm
import numpy as np
import xarray as xr
from papylio.plugins.sequencing.sequence_generation import generate_sequences
%matplotlib inline
Experiment import#
Note that the sequencing data is automatically imported.
[2]:
experiment_path = Path(r'C:\Users\user\Desktop\SPARXS example dataset')
[3]:
data_per_sequence_path = experiment_path / 'Analysis' / 'Datasets per sequence'
data_per_sequence_path
[3]:
WindowsPath('C:/Users/user/Desktop/SPARXS example dataset/Analysis/Datasets per sequence')
[4]:
save_path = experiment_path / 'Analysis'
save_path
[4]:
WindowsPath('C:/Users/user/Desktop/SPARXS example dataset/Analysis')
[5]:
exp = pp.Experiment(data_per_sequence_path)
Import files: 100%|█████████████████████████████████████████████████████████████| 7948/7948 [00:00<00:00, 42768.82it/s]
Initialize experiment:
C:\Users\user\Desktop\SPARXS example dataset\Analysis\Datasets per sequence
[6]:
file_HJ1 = exp.files.select('TTAGCCGA', 'name')[0]
file_HJ3 = exp.files.select('AATCGGCT', 'name')[0]
file_HJ7 = exp.files.select('GGCGCCGC', 'name')[0]
files_HJ137 = exp.files.select('TTAGCCGA|AATCGGCT|GGCGCCGC', 'name')
[7]:
files = files_HJ137
Create properties per sequence dataset#
Here we create a new dataset to store the properties per sequence.
[8]:
ds = xr.Dataset()
ds['sequence_subset'] = ['TTAGCCGA', 'AATCGGCT', 'GGCGCCGC']
# ds['sequence_subset'] = generate_sequences('NNNNNNNN') # When measuring all variations
ds.set_coords('sequence_subset')
ds.to_netcdf(save_path / 'properties_per_sequence.nc')
ds
[8]:
<xarray.Dataset>
Dimensions: (sequence_subset: 3)
Coordinates:
* sequence_subset (sequence_subset) <U8 'TTAGCCGA' 'AATCGGCT' 'GGCGCCGC'
Data variables:
*empty*Number of molecules#
Add the total number of molecules and the number of selected molecules
[9]:
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')
number_of_molecules_selected = []
number_of_molecules = []
sequence_subset = []
for file in tqdm.tqdm(files):
number_of_molecules_selected.append(file.number_of_selected_molecules)
number_of_molecules.append(file.number_of_molecules)
sequence_subset.append(file.name)
ds['number_of_molecules'] = xr.DataArray(number_of_molecules, dims=('sequence_subset'),
coords={'sequence_subset': sequence_subset})
ds['number_of_molecules_selected'] = xr.DataArray(number_of_molecules_selected, dims=('sequence_subset'),
coords={'sequence_subset': sequence_subset})
ds.to_netcdf(save_path / 'properties_per_sequence.nc')
ds
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 21.23it/s]
[9]:
<xarray.Dataset>
Dimensions: (sequence_subset: 3)
Coordinates:
* sequence_subset (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables:
number_of_molecules (sequence_subset) int32 669 1006 753
number_of_molecules_selected (sequence_subset) int32 241 284 238Number of states#
Add the number of molecules with one or two states, and the fraction of molecules that show two states.
[10]:
state_count = files.serial.state_count(selected=True, states=np.arange(3))
state_count = state_count.rename(name='sequence_subset')
state_count.to_netcdf(save_path / 'state_count.nc')
state_count
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.08it/s]
Serial processing
[10]:
<xarray.DataArray 'state_count' (sequence_subset: 3, number_of_states: 3)>
array([[ 12, 55, 217],
[ 8, 43, 187],
[ 1, 41, 199]])
Coordinates:
* sequence_subset (sequence_subset) <U8 'AATCGGCT' 'GGCGCCGC' 'TTAGCCGA'
* number_of_states (number_of_states) int32 0 1 2[11]:
state_count = xr.load_dataarray(save_path / 'state_count.nc')
count_1_state = state_count.sel(number_of_states=1, drop=True)
count_1_state.name = 'count_1_state'
count_1_state.attrs['unit'] = ''
count_2_states = state_count.sel(number_of_states=2, drop=True)
count_2_states.name = 'count_2_states'
count_2_states.attrs['unit'] = ''
fraction_2_states = count_2_states / (count_1_state + count_2_states)
fraction_2_states.name = 'fraction_2_states'
fraction_2_states.attrs['unit'] = ''
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')
ds['count_1_state'] = count_1_state
ds['count_2_states'] = count_2_states
ds['fraction_2_states'] = fraction_2_states
ds.to_netcdf(save_path / 'properties_per_sequence.nc')
ds
[11]:
<xarray.Dataset>
Dimensions: (sequence_subset: 3)
Coordinates:
* sequence_subset (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables:
number_of_molecules (sequence_subset) int32 669 1006 753
number_of_molecules_selected (sequence_subset) int32 241 284 238
count_1_state (sequence_subset) int32 41 55 43
count_2_states (sequence_subset) int32 199 217 187
fraction_2_states (sequence_subset) float64 0.8292 0.7978 0.813Transition rates#
Add the mean and standard deviation of the transition rates among the molecules
[12]:
files.name.data
[12]:
['AATCGGCT', 'GGCGCCGC', 'TTAGCCGA']
[13]:
transition_rates_hmm = xr.Dataset(coords={'sequence_subset': files.name.data})
transition_rates_mean_hmm = files.get_variable('transition_rate', selected=True, average='molecule')
transition_rates_mean_hmm = transition_rates_mean_hmm.rename(name='sequence_subset')
transition_rates_mean_hmm.attrs['unit'] = '/s'
transition_rates_hmm['mean'] = transition_rates_mean_hmm
def transition_rate_std_hmm(file):
return file.get_variable('transition_rate', selected=True).std(dim='molecule').expand_dims({'sequence_subset': [file.name]}, 0)
transition_rates_std_hmm = xr.concat(files.map(transition_rate_std_hmm)(), dim='sequence_subset')
transition_rates_std_hmm.attrs['unit'] = '/s'
transition_rates_hmm['std'] = transition_rates_std_hmm
transition_rates_hmm.to_netcdf(save_path / 'transition_rates_hmm.nc')
transition_rates_hmm
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.42it/s]
0%| | 0/3 [00:00<?, ?it/s]
Serial processing
Serial processing
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.88it/s]
[13]:
<xarray.Dataset>
Dimensions: (sequence_subset: 3, from_state: 2, to_state: 2)
Coordinates:
* sequence_subset (sequence_subset) object 'AATCGGCT' 'GGCGCCGC' 'TTAGCCGA'
Dimensions without coordinates: from_state, to_state
Data variables:
mean (sequence_subset, from_state, to_state) float32 -6.31 .....
std (sequence_subset, from_state, to_state) float32 4.638 .....[14]:
transition_rates_hmm = xr.load_dataset(save_path / 'transition_rates_hmm.nc')
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')
ds['transition_rate_0_1'] = transition_rates_hmm['mean'].sel(from_state=0, to_state=1, drop=True)
ds['transition_rate_0_1'].attrs['unit'] = '/s'
ds['transition_rate_1_0'] = transition_rates_hmm['mean'].sel(from_state=1, to_state=0, drop=True)
ds['transition_rate_1_0'].attrs['unit'] = '/s'
ds.to_netcdf(save_path / 'properties_per_sequence.nc')
ds
[14]:
<xarray.Dataset>
Dimensions: (sequence_subset: 3)
Coordinates:
* sequence_subset (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables:
number_of_molecules (sequence_subset) int32 669 1006 753
number_of_molecules_selected (sequence_subset) int32 241 284 238
count_1_state (sequence_subset) int32 41 55 43
count_2_states (sequence_subset) int32 199 217 187
fraction_2_states (sequence_subset) float64 0.8292 0.7978 0.813
transition_rate_0_1 (sequence_subset) float32 3.722 7.909 3.355
transition_rate_1_0 (sequence_subset) float32 8.402 2.994 2.902FRET#
Add the mean and standard deviation of the FRET values for each state, with separate values for 1-state molecules and 2-state molecules.
[15]:
def generate_FRET_classification(file):
FRET_classification = xr.Dataset()
FRET_per_state = file.FRET.where(file.classification_binary(positive_states_only=True)).transpose(...,'frame')
FRET_1_state = FRET_per_state.sel(molecule=(file.number_of_states==1)&file.selected.values).mean(['molecule','frame'])
FRET_2_states = FRET_per_state.sel(molecule=(file.number_of_states==2)&file.selected.values).mean(['molecule','frame'])
FRET_std_1_state = FRET_per_state.sel(molecule=(file.number_of_states==1)&file.selected.values).std(['molecule','frame'])
FRET_std_2_states = FRET_per_state.sel(molecule=(file.number_of_states==2)&file.selected.values).std(['molecule','frame'])
FRET_classification['mean'] = xr.concat([FRET_1_state, FRET_2_states], dim='number_of_states')\
.assign_coords(number_of_states=[1,2])
FRET_classification['std'] = xr.concat([FRET_std_1_state, FRET_std_2_states], dim='number_of_states')\
.assign_coords(number_of_states=[1,2])
FRET_classification
return FRET_classification
FRET_classification = xr.concat(files.serial.map(generate_FRET_classification)(), dim='name')\
.assign_coords(sequence_subset=files.name)
FRET_classification = FRET_classification.rename(name='sequence_subset')
FRET_classification.to_netcdf(save_path / 'FRET_classification.nc')
FRET_classification
0%| | 0/3 [00:00<?, ?it/s]
Serial processing
C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
33%|████████████████████████████ | 1/3 [00:00<00:00, 3.26it/s]C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
67%|████████████████████████████████████████████████████████ | 2/3 [00:00<00:00, 3.54it/s]C:\Users\user\miniconda3\envs\trace_analysis\lib\site-packages\numpy\lib\nanfunctions.py:1879: RuntimeWarning: Degrees of freedom <= 0 for slice.
var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 3.60it/s]
[15]:
<xarray.Dataset>
Dimensions: (state: 2, number_of_states: 2, sequence_subset: 3)
Coordinates:
* state (state) int8 0 1
* number_of_states (number_of_states) int32 1 2
* sequence_subset (sequence_subset) <U8 'AATCGGCT' 'GGCGCCGC' 'TTAGCCGA'
Data variables:
mean (sequence_subset, number_of_states, state) float64 0.22...
std (sequence_subset, number_of_states, state) float64 0.18...[16]:
FRET_classification = xr.load_dataset(save_path / 'FRET_classification.nc')
ds = xr.load_dataset(save_path / 'properties_per_sequence.nc')
ds['FRET_1_state'] = FRET_classification['mean'].sel(number_of_states=1, state=0, drop=True)
ds['FRET_std_1_state'] = FRET_classification['std'].sel(number_of_states=1, state=0, drop=True)
ds['FRET_2_states_0'] = FRET_classification['mean'].sel(number_of_states=2, state=0, drop=True)
ds['FRET_2_states_1'] = FRET_classification['mean'].sel(number_of_states=2, state=1, drop=True)
ds['FRET_std_2_states_0'] = FRET_classification['std'].sel(number_of_states=2, state=0, drop=True)
ds['FRET_std_2_states_1'] = FRET_classification['std'].sel(number_of_states=2, state=1, drop=True)
ds.to_netcdf(save_path / 'properties_per_sequence.nc')
ds
[16]:
<xarray.Dataset>
Dimensions: (sequence_subset: 3)
Coordinates:
* sequence_subset (sequence_subset) object 'TTAGCCGA' ... 'GG...
Data variables: (12/13)
number_of_molecules (sequence_subset) int32 669 1006 753
number_of_molecules_selected (sequence_subset) int32 241 284 238
count_1_state (sequence_subset) int32 41 55 43
count_2_states (sequence_subset) int32 199 217 187
fraction_2_states (sequence_subset) float64 0.8292 0.7978 0.813
transition_rate_0_1 (sequence_subset) float32 3.722 7.909 3.355
... ...
FRET_1_state (sequence_subset) float64 0.197 0.2284 0.2214
FRET_std_1_state (sequence_subset) float64 0.1259 0.181 0.1568
FRET_2_states_0 (sequence_subset) float64 0.1861 0.2661 0.2003
FRET_2_states_1 (sequence_subset) float64 0.4251 0.5078 0.472
FRET_std_2_states_0 (sequence_subset) float64 0.08322 ... 0.09612
FRET_std_2_states_1 (sequence_subset) float64 0.1324 ... 0.1094The data can be easily converted to an excel sheet.
[17]:
ds.to_pandas().to_excel(save_path / 'properties_per_sequence.xlsx')
ds.to_pandas()
[17]:
| number_of_molecules | number_of_molecules_selected | count_1_state | count_2_states | fraction_2_states | transition_rate_0_1 | transition_rate_1_0 | FRET_1_state | FRET_std_1_state | FRET_2_states_0 | FRET_2_states_1 | FRET_std_2_states_0 | FRET_std_2_states_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sequence_subset | |||||||||||||
| TTAGCCGA | 669 | 241 | 41 | 199 | 0.829167 | 3.722390 | 8.402450 | 0.196950 | 0.125939 | 0.186073 | 0.425090 | 0.083220 | 0.132375 |
| AATCGGCT | 1006 | 284 | 55 | 217 | 0.797794 | 7.908979 | 2.993645 | 0.228415 | 0.181038 | 0.266083 | 0.507820 | 0.119068 | 0.096357 |
| GGCGCCGC | 753 | 238 | 43 | 187 | 0.813043 | 3.355469 | 2.901715 | 0.221436 | 0.156791 | 0.200279 | 0.471952 | 0.096120 | 0.109417 |