问题
I'm facing sometimes Memory Error
, sometimes it goes through fine and sometimes it pops up.. Specifically when trying to subtract large array by one. I tried many ways to do this subtraction, is there any way to avoid this? and is my other code parts will also sometime arise this error?
Here is my code:
def home(request):
if request.method=="POST":
img = UploadForm(request.POST, request.FILES)
no_clus = int(request.POST.get('num_clusters', 10))
if img.is_valid():
paramFile =io.TextIOWrapper(request.FILES['pic'].file)
portfolio1 = csv.DictReader(paramFile)
users = []
users = [row["BASE_NAME"] for row in portfolio1]
print(len(users))
my_list = users
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(my_list)
lsa = TruncatedSVD(n_components=100)
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
dist1 = (1- np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T))
# print(1-similarity)
k = len(my_list)
# dist1 = (1- similarity)
# dist1=similarity
# dist1.astype(float)
#print(dist1)
# print(cosine_similarity(tfidf_matrix[3:4], tfidf_matrix))
# float dist = 1 - similarity;
data2 = np.asarray(dist1)
arr_3d = data2.reshape((1, k, k))
# arr_3d= 1- arr_3d
#print(arr_3d)
no_cluster = number_cluster(len(my_list))
print(no_cluster)
for i in range(len(arr_3d)):
# print (i+1910)
# km = AgglomerativeClustering(n_clusters=no_clus, linkage='ward').fit(arr_3d[i])
km = AgglomerativeClustering(n_clusters=no_cluster, linkage='average').fit(arr_3d[i])
# km = AgglomerativeClustering(n_clusters=no_clus, linkage='complete').fit(arr_3d[i])
# km = MeanShift()
# km = KMeans(n_clusters=no_clus, init='k-means++')
# km = MeanShift()
# km = km.fit(arr_3d[i])
# print km
labels = km.labels_
csvfile = settings.MEDIA_ROOT +'\\'+ 'images\\export.csv'
csv_input = pd.read_csv(csvfile, encoding='latin-1')
csv_input['cluster_ID'] = labels
csv_input['BASE_NAME'] = my_list
csv_input.to_csv(settings.MEDIA_ROOT +'/'+ 'output.csv', index=False)
clus_groups = list()
for j in range(no_cluster):
# print(" cluster no %i:%s" % (j, [my_list[i] for i, x in enumerate(labels) if x == j]))
list_of_ints = ([my_list[i] for i, x in enumerate(labels) if x == j])
clus_groups.append(' '.join(list_of_ints))
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(my_list)
lsa = TruncatedSVD(n_components=100)
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
dist1 = (1 - np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T))
# similarity = np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T)
k = len(my_list)
# dist1 = 1 - similarity
data2 = np.asarray(dist1)
arr_3d = data2.reshape((1, k, k))
# arr_3d= 1- arr_3d
#no_clus = 5
# no_clus=get_name(request)
for i in range(len(arr_3d)):
# print (i+1910)
# km = AgglomerativeClustering(n_clusters=no_clus, linkage='ward').fit(arr_3d[i])
# km = AgglomerativeClustering(n_clusters=no_clus, linkage='average').fit(arr_3d[i])
# km = AgglomerativeClustering(n_clusters=no_clus, linkage='complete').fit(arr_3d[i])
km = KMeans(n_clusters=no_clus, init='k-means++')
km = km.fit(arr_3d[i])
# print km
labels2 = km.labels_
# error = km.inertia_
print(labels2)
labels = labels.tolist()
labels2 = labels2.tolist()
# new=list()
csv_input = pd.read_csv(settings.MEDIA_ROOT +'/'+ 'output.csv',encoding='latin-1')
labels1 = csv_input['cluster_ID']
new_list = []
for k in labels1:
new_list.append(labels2[k]) # lookup the value in list2 at the index given by list1
print(new_list)
print(len(new_list))
csv_input = pd.read_csv(settings.MEDIA_ROOT +'/'+ 'output.csv',encoding='latin-1')
csv_input['cluster_ID'] = labels
csv_input['BASE_NAME'] = my_list
csv_input['User_Map'] = new_list
csv_input.to_csv(settings.MEDIA_ROOT + '/' + 'output1.csv', index=False)
#filename= settings.MEDIA_ROOT +'/'+ 'output.csv'
send_file(request)
# my_list = portfolio
#save_file('output1.csv')
# csv(request)
# return HttpResponseRedirect(reverse('labels'))
return render(request, 'new.html', {'labels': labels})
else:
img=UploadForm()
images=Upload.objects.all()
return render(request,'new.html',{'form':img,'images':images})
the error
is when trying to do dist1 = (1- np.asarray(numpy.asmatrix(dtm_lsa) * numpy.asmatrix(dtm_lsa).T))
.. I also tried to create new array with all ones with the same size and then subtract.. How should I modify this to prevent this error? Note that the user interface that will run this code can be operated on any pc!
回答1:
Not sure, but in the incriminated line you use numpy.asmatrix(dtm_lsa), which is a function call that allocates some memory.
You're doing it twice so it creates twice too much memory (that's before it's garbage collected, but it's too late in some cases)
(not patronizing you at all: That's the common trap in mathematical formulae, but those formulae must be adapted when programmed into a computer).
I would suggest to replace the line by those lines:
temp_matrix = numpy.asmatrix(dtm_lsa)
product = temp_matrix * temp_matrix.T
# maybe call the garbage collector at this point: gc.collect()
dist1 = (1- np.asarray(product))
That way 1) less copy/paste and 2) not a lot of big matrix allocations in a single line
来源:https://stackoverflow.com/questions/37844596/avoid-memory-error-when-dealing-with-large-arrays