import csv import itertools import sys PROBS = { # Unconditional probabilities for having gene "gene": { 2: 0.01, 1: 0.03, 0: 0.96 }, "trait": { # Probability of trait given two copies of gene 2: { True: 0.65, False: 0.35 }, # Probability of trait given one copy of gene 1: { True: 0.56, False: 0.44 }, # Probability of trait given no gene 0: { True: 0.01, False: 0.99 } }, # Mutation probability "mutation": 0.01 } def main(): # Check for proper usage if len(sys.argv) != 2: sys.exit("Usage: python heredity.py data.csv") people = load_data(sys.argv[1]) # Keep track of gene and trait probabilities for each person probabilities = { person: { "gene": { 2: 0, 1: 0, 0: 0 }, "trait": { True: 0, False: 0 } } for person in people } # Loop over all sets of people who might have the trait names = set(people) for have_trait in powerset(names): # Check if current set of people violates known information fails_evidence = any( (people[person]["trait"] is not None and people[person]["trait"] != (person in have_trait)) for person in names ) if fails_evidence: continue # Loop over all sets of people who might have the gene for one_gene in powerset(names): for two_genes in powerset(names - one_gene): # Update probabilities with new joint probability p = joint_probability(people, one_gene, two_genes, have_trait) update(probabilities, one_gene, two_genes, have_trait, p) # Ensure probabilities sum to 1 normalize(probabilities) # Print results for person in people: print(f"{person}:") for field in probabilities[person]: print(f" {field.capitalize()}:") for value in probabilities[person][field]: p = probabilities[person][field][value] print(f" {value}: {p:.4f}") def load_data(filename): """ Load gene and trait data from a file into a dictionary. File assumed to be a CSV containing fields name, mother, father, trait. mother, father must both be blank, or both be valid names in the CSV. trait should be 0 or 1 if trait is known, blank otherwise. """ data = dict() with open(filename) as f: reader = csv.DictReader(f) for row in reader: name = row["name"] data[name] = { "name": name, "mother": row["mother"] or None, "father": row["father"] or None, "trait": (True if row["trait"] == "1" else False if row["trait"] == "0" else None) } return data def powerset(s): """ Return a list of all possible subsets of set s. """ s = list(s) return [ set(s) for s in itertools.chain.from_iterable( itertools.combinations(s, r) for r in range(len(s) + 1) ) ] def joint_probability(people, one_gene, two_genes, have_trait): """ Compute and return a joint probability. Args: - people is a dictionary of people. The keys represent names, and the values are dictionaries that contain mother and father keys. You may assume that either mother and father are both blank (no parental information in the data set), or mother and father will both refer to other people in the people dictionary. - one_gene is a set of all people for whom we want to compute the probability that they have one copy of the gene. - two_genes is a set of all people for whom we want to compute the probability that they have two copies of the gene. - have_trait is a set of all people for whom we want to compute the probability that they have the trait. The probability returned should be the probability that - everyone in set `one_gene` has one copy of the gene, and - everyone in set `two_genes` has two copies of the gene, and - everyone not in `one_gene` or `two_gene` does not have the gene, and - everyone in set `have_trait` has the trait, and - everyone not in set` have_trait` does not have the trait. and trait = {"Harry", "James"} should calculate the probability that Lily has zero copies of the gene, Harry has one copy of the gene, James has two copies of the gene, Harry exhibits the trait, James exhibits the trait, and Lily does not exhibit the trait. For anyone with no parents listed in the data set, use the probability distribution PROBS["gene"] to determine the probability that they have a particular number of the gene. For anyone with parents in the data set, each parent will pass one of their two genes on to their child randomly, and there is a PROBS["mutation"] chance that it mutates (goes from being the gene to not being the gene, or vice versa). Use the probability distribution PROBS["trait"] to compute the probability that a person does or does not have a particular trait. """ probability = 1 for person in people: # Check if the person has one copy of the gene if person in one_gene: # Calculate the probability of having one copy of the gene gene_prob = PROBS["gene"][1] # Check if the person has two copies of the gene elif person in two_genes: # Calculate the probability of having two copies of the gene gene_prob = PROBS["gene"][2] else: # Calculate the probability of not having the gene gene_prob = PROBS["gene"][0] # Check if the person has the trait if person in have_trait: # Calculate the probability of having the trait trait_prob = PROBS["trait"][gene_prob][True] else: # Calculate the probability of not having the trait trait_prob = PROBS["trait"][gene_prob][False] # Multiply the gene and trait probabilities probability *= gene_prob * trait_prob return probability def update(probabilities, one_gene, two_genes, have_trait, p): """ Add to `probabilities` a new joint probability `p`. Each person should have their "gene" and "trait" distributions updated. Which value for each distribution is updated depends on whether the person is in `have_gene` and `have_trait`, respectively. Args: - probabilities is a dictionary of people. Each person is mapped to a "gene" distribution and a "trait" distribution. - one_gene is a set of people with one copy of the gene in the current joint distribution. - two_genes is a set of people with two copies of the gene in the current joint distribution. - have_trait is a set of people with the trait in the current joint distribution. - p is the probability of the joint distribution. For each person person in `probabilities`, the function should update the probabilities[person]["gene"] distribution and probabilities[person]["trait"] distribution by adding p to the appropriate value in each distribution. All other values should be left unchanged. For example, if "Harry" were in both two_genes and in have_trait, then p would be added to probabilities["Harry"]["gene"][2] and to probabilities["Harry"]["trait"][True]. The function should not return any value: it just needs to update the probabilities dictionary. """ for person in probabilities: if person in one_gene: probabilities[person]["gene"][1] += p elif person in two_genes: probabilities[person]["gene"][2] += p else: probabilities[person]["gene"][0] += p if person in have_trait: probabilities[person]["trait"][True] += p else: probabilities[person]["trait"][False] += p def normalize(probabilities): """ Update `probabilities` such that each probability distribution is normalized (i.e., sums to 1, with relative proportions the same). Args: - probabilities is a dictionary of people. Each person is mapped to a "gene" distribution and a "trait" distribution. For both of the distributions for each person in probabilities, this function should normalize that distribution so that the values in the distribution sum to 1, and the relative values in the distribution are the same. For example, if probabilities["Harry"]["trait"][True] were equal to 0.1 and probabilities["Harry"]["trait"][False] were equal to 0.3, then your function should update the former value to be 0.25 and the latter value to be 0.75: the numbers now sum to 1, and the latter value is still three times larger than the former value. The function should not return any value: it just needs to update the probabilities dictionary. """ for person in probabilities: # Calculate the sum of the gene probabilities gene_sum = sum(probabilities[person]["gene"].values()) # Calculate the sum of the trait probabilities trait_sum = sum(probabilities[person]["trait"].values()) # Normalize the gene probabilities for gene in probabilities[person]["gene"]: probabilities[person]["gene"][gene] /= gene_sum # Normalize the trait probabilities for trait in probabilities[person]["trait"]: probabilities[person]["trait"][trait] /= trait_sum if __name__ == "__main__": main()